In [1]:

# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:

from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)

In [3]:

metadata = lsl_dr_project.export_metadata()

In [4]:

# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:

articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})

In [6]:

records = lsl_dr_project.export_records(fields=articulation_fields)

In [7]:

print(records[0]['study_id'])

0101-2003-0101

In [8]:

expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', df_kwargs={'index_col':None,
                                                                                             'na_values':[999, 9999]})

In [9]:

receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', df_kwargs={'index_col':None,
                                                                                             'na_values':[999, 9999]})

In [10]:

language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', df_kwargs={'index_col':None,
                                                                                             'na_values':[999, 9999]})

In [11]:

demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[999, 9999]})

Attendance information¶

Several fields in the demographic data have missing values.

In [12]:

demographic_raw.head()

Out[12]:

	study_id	redcap_event_name	academic_year	gender	race	prim_lang	sib	mother_ed	father_ed	...	sle_fo	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid
0	0101-2003-0101	initial_assessment_arm_1	2002-2003	0	0	0	1	6	6	...	2	2	54	2	NaN	NaN	NaN	NaN	NaN	NaN
1	0101-2003-0101	year_1_complete_71_arm_1	2003-2004	NaN	NaN	NaN	NaN	NaN	NaN	...	4	4	80	1	NaN	NaN	NaN	NaN	NaN	NaN
2	0101-2003-0101	year_2_complete_71_arm_1	2004-2005	NaN	NaN	NaN	NaN	NaN	NaN	...	4	4	80	2	NaN	NaN	NaN	NaN	NaN	NaN
3	0101-2003-0101	year_3_complete_71_arm_1	2005-2006	NaN	NaN	NaN	NaN	NaN	NaN	...	5	5	96	3	NaN	NaN	NaN	NaN	NaN	NaN
4	0101-2003-0101	year_4_complete_71_arm_1	2006-2007	NaN	NaN	NaN	NaN	NaN	NaN	...	5	5	109	2	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 46 columns

We can fill missing values forward from previous observation (by study_id)

In [13]:

demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill')).reset_index()
demographic["study_id"] = demographic_raw.study_id

Demographic data without missing values:

In [14]:

demographic.head()

Out[14]:

	index	redcap_event_name	academic_year	gender	race	prim_lang	sib	mother_ed	father_ed	...	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid	study_id
0	699	initial_assessment_arm_1	NaN	0	0	6	3	5	4	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0101-2003-0101
1	9715	initial_assessment_arm_1	2009-2010	1	8	0	0	6	6	...	3	50	0	NaN	NaN	NaN	NaN	NaN	NaN	0101-2003-0101
2	9719	initial_assessment_arm_1	2009-2010	1	6	0	1	6	6	...	3	32	0	NaN	NaN	NaN	NaN	NaN	NaN	0101-2003-0101
3	9723	initial_assessment_arm_1	2009-2010	0	6	0	1	4	4	...	4	25	0	NaN	NaN	NaN	NaN	NaN	NaN	0101-2003-0101
4	946	initial_assessment_arm_1	2011-2012	1	1	0	0	6	6	...	0	30	3	NaN	NaN	NaN	NaN	NaN	NaN	0101-2003-0101

5 rows × 47 columns

Cleaning languge dataset¶

5 language measures:

3 versions of CELF
PLS
- pls_ac_rs: PLS: Auditory Comprehension Raw Score
- pls_ac_ss: PLS: Auditory Comprehension Standard Score
- pls_ec_rs: PLS: Expressive Communication Raw Score
- pls_ec_ss: PLS: Expressive Communication Standard Score
- pls_tl_rs: PLS: Total Language Score Standard Score Total
- pls_tl_ss: PLS: Total Language Score Standard Score
OWLS
- age_test_owls: Age at time of testing (OWLS)
- owls_lc_rs: OWLS: Listening Comprehension Raw Score
- owls_lc_ss: OWLS: Listening Comprehension Standard Score
- owls_oe_rs: OWLS: Oral Expression Raw Score
- owls_oe_ss: OWLS: Oral Expression Standard Score
- owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
- owls_oc_ss: OWLS: Oral Composite Standard Score
- owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
- owls_wes_as: OWLS: Written Expression Scale Ability Score
- owls_wes_ss: OWLS: Written Expression Scale Standard Score
- owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
- owls_lcss: OWLS: Language Composite Standard Score

In [15]:

# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))

test_type  expressive  receptive
test_name                       
CELF-4            539        489
CELF-P2          1168       1174
OWLS              867        873
PLS              2884       2893
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [16]:

language["school"] = language.study_id.str.slice(0,4)

In [17]:

language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()

Out[17]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain
0	0101-2003-0101	initial_assessment_arm_1	51	receptive	PLS	0101	54	Language
5	0101-2003-0101	year_5_complete_71_arm_1	61	receptive	OWLS	0101	113	Language
9	0101-2003-0102	initial_assessment_arm_1	55	receptive	PLS	0101	44	Language
10	0101-2003-0102	year_1_complete_71_arm_1	77	receptive	PLS	0101	54	Language
11	0101-2003-0102	year_2_complete_71_arm_1	93	receptive	CELF-P2	0101	68	Language

Cleaning articulation dataset¶

We converted the articulation dataset into a "long" format:

In [18]:

# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]

Goldman                 4250
Arizonia                 485
Arizonia and Goldman      49
dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [19]:

articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [20]:

articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())

count    4781.000000
mean       69.179042
std        31.232998
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [21]:

articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()

Out[21]:

	study_id	redcap_event_name	test_type	score	school	age_test	domain
1	0101-2003-0101	year_1_complete_71_arm_1	Goldman	78	0101	80	Articulation
9	0101-2003-0102	initial_assessment_arm_1	Goldman	72	0101	44	Articulation
10	0101-2003-0102	year_1_complete_71_arm_1	Goldman	97	0101	54	Articulation
14	0101-2004-0101	year_2_complete_71_arm_1	Goldman	75	0101	53	Articulation
15	0101-2004-0101	year_3_complete_71_arm_1	Goldman	80	0101	66	Articulation

Cleaning demographic dataset¶

We excluded unwanted columns and rows for which age, gender or race were missing:

In [22]:

# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [23]:

demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))

False    9667
True     2087
dtype: int64
There are 714 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

0=no high school diploma
1=high school
2=undergraduate
3=graduate

Category 6 (unknown) was recoded as missing.

In [24]:

demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))

_mother_ed:
6    4298
4    2483
3    1682
5    1280
2    1113
1     421
0     156
dtype: int64
mother_ed:
1    2795
2    2483
3    1280
0     577
dtype: int64

There are 5333 null values for mother_ed

Secondary diagnosis

In [25]:

demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None

In [26]:

demographic.secondary_diagnosis.value_counts()

Out[26]:

0    9122
1    2120
dtype: int64

In [27]:

demographic.secondary_diagnosis.mean()

Out[27]:

0.18857854474292832

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [28]:

demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))

There are 3179 null values for premature_weeks

In [29]:

demographic.premature_weeks.value_counts()

Out[29]:

0     7871
2      486
4      324
12     180
6      159
10     125
8      104
14      38
16       2
dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [30]:

tech_cats = ["None", "Baha", "Hearing aid", "Cochlear"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.tech_left = np.abs(demographic.tech_left - 3)

Substitute valid missing values for hearing loss:

In [31]:

demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [32]:

demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

0=none
1=one ear
2=both ears.

In [33]:

demographic["baha"] = 0
demographic.baha = demographic.baha.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'baha'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'baha'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'baha'] = None
print("baha:")
print(demographic.drop_duplicates(subset='study_id').baha.value_counts())
print("There are {0} null values for baha".format(sum(demographic.baha.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))

baha:
0    4121
1     136
2      67
dtype: int64
There are 1476 null values for baha

hearing_aid:
0    1902
2    1723
1     679
dtype: int64
There are 1516 null values for hearing_aid

cochlear:
0    2430
2    1112
1     782
dtype: int64
There are 1476 null values for cochlear
12468

Identify bilateral and bimodal individuals:

In [34]:

demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)

In [35]:

demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum()

Out[35]:

(2934, 4333, 1220)

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [36]:

demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))

There are 0 null values for tech

In [37]:

demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.baha==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.baha==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.baha==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()

Out[37]:

6    4333
3    2934
4    1220
0     680
1     470
2     293
8     168
7      19
5       8
dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [38]:

demographic.onset_1.unique()

Out[38]:

array([   nan,    4. ,   14. ,   24. ,   19. ,    0. ,   41. ,    1. ,
         17. ,    5. ,   13. ,   30. ,   43. ,   16. ,   32. ,    8. ,
          9. ,   25. ,    2. ,    3. ,   33. ,   34. ,   36. ,    6. ,
         40. ,   31. ,   22. ,   39. ,    7. ,   12. ,   42. ,   15. ,
         18. ,   57. ,   26. ,   28. ,   52. ,   11. ,   59. ,   29. ,
         23. ,   10. ,   21. ,   38. ,   20. ,   46. ,   37. ,   47. ,
         60. ,   48. ,    1.5,   56. ,   44. ,   35. ,   27. ,   86. ,
         64. ,   58. ,   70. ,   49. ,    0.5,   51. ,   55. ,   50. ,
        119. ,   72. ,   88. ,   65. ,   66. ,   54. ,  116. ,   78. ,
         83. ,   61. ,  107. ,   74. ,   77. ,   62. ,   53. ,   63. ,
         84. ,  140. ,   80. ,  126. ,   85. ,  133. ,   81. ,  103. ,
         87. ,   76. ,   45. ,   68. ,   92. ,   67. ,    2.5,   97. ,
         71. ,   75. ,   98. ,  152. ,   89. ,  154. ])

In [39]:

# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [40]:

demographic.age_diag.isnull().sum()

Out[40]:

In [115]:

demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})

In [144]:

import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

ag = sb.factorplot("sex", data=unique_students, 
              palette="PuBuGn_d", kind='count')
ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
                    'Male ({})'.format((unique_students.male==1).sum())])
ag.set_xlabels('')

Out[144]:

<seaborn.axisgrid.FacetGrid at 0x11c654d68>

Child has another diagnosed disability

In [41]:

demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None

In [42]:

# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [43]:

demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [44]:

races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column

_race:
0    6519
2    2000
1    1155
3     861
6     587
8     463
7     223
4      58
5      25
dtype: int64
race:
0    6519
2    2000
1    1155
4    1133
3     861
dtype: int64
There are 800 null values for race

Recode implant technology variables

In [45]:

tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)

In [46]:

# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)

In [47]:

demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013',
                                   '0000-0000': np.nan})

In [48]:

demographic['academic_year_start'] = demographic.academic_year.apply(lambda x: str(x).strip()[:4]).value_counts()

In [49]:

demographic.age_amp.hist()

Out[49]:

<matplotlib.axes._subplots.AxesSubplot at 0x117c634e0>

Cleaning expressive vocabulary dataset¶

We converted the expressive vocabulary dataset to "long" format:

In [50]:

# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]

There are 0 null values for test_type

In [51]:

expressive.test_type.value_counts()

Out[51]:

EVT               3109
EOWPVT            2300
EOWPVT and EVT     120
dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [52]:

expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [53]:

expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [54]:

expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()

Out[54]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
0	0101-2003-0101	initial_assessment_arm_1	58	EOWPVT	0101	54	Expressive Vocabulary
2	0101-2003-0101	year_2_complete_71_arm_1	84	EOWPVT	0101	80	Expressive Vocabulary
5	0101-2003-0101	year_5_complete_71_arm_1	90	EOWPVT	0101	113	Expressive Vocabulary
14	0101-2004-0101	year_2_complete_71_arm_1	90	EOWPVT	0101	53	Expressive Vocabulary
15	0101-2004-0101	year_3_complete_71_arm_1	87	EOWPVT	0101	66	Expressive Vocabulary

Cleaning receptive vocabulary dataset¶

We converted the receptive vocabulary data table to "long" format:

In [55]:

receptive.columns

Out[55]:

Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')

In [56]:

# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]

There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [57]:

receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [58]:

receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]

In [59]:

print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))

There are 28 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [60]:

receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()

Out[60]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
2	0101-2003-0101	year_2_complete_71_arm_1	90	PPVT	0101	80	Receptive Vocabulary
5	0101-2003-0101	year_5_complete_71_arm_1	101	ROWPVT	0101	113	Receptive Vocabulary
9	0101-2003-0102	initial_assessment_arm_1	55	PPVT	0101	44	Receptive Vocabulary
10	0101-2003-0102	year_1_complete_71_arm_1	80	PPVT	0101	54	Receptive Vocabulary
11	0101-2003-0102	year_2_complete_71_arm_1	101	PPVT	0101	68	Receptive Vocabulary

Merge datasets¶

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [61]:

test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [62]:

lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')

In [63]:

lsl_dr.tail()

Out[63]:

	index	redcap_event_name	academic_year	male	_race	prim_lang	sib	_mother_ed	father_ed	...	synd_or_disab	race	academic_year_start	age_test	domain	school	score	test_name	test_type
25269	10878	year_9_complete_71_arm_1	2008-2009	1	0	0	1	3	2	...	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25270	1449	year_9_complete_71_arm_1	2013-2014	1	0	0	1	NaN	NaN	...	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25271	1440	year_9_complete_71_arm_1	2013-2014	1	2	0	NaN	6	6	...	0	2	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25272	244	year_9_complete_71_arm_1	2013-2014	1	0	6	2	6	6	...	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25273	4822	year_9_complete_71_arm_1	2012-2013	1	0	0	3	4	4	...	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 73 columns

In [64]:

lsl_dr.score.hist(bins=50)

Out[64]:

<matplotlib.axes._subplots.AxesSubplot at 0x118085ba8>

Export dataset

In [65]:

lsl_dr.to_csv('lsl_dr.csv')

In [66]:

lsl_dr.shape

Out[66]:

(25274, 73)

In [67]:

lsl_dr.study_id.unique().shape

Out[67]:

(4893,)

In [68]:

demographic.study_id.unique().shape

Out[68]:

(4893,)

Convert score to floating-point number

In [69]:

lsl_dr.score = lsl_dr.score.astype(float)

Plots of Demographic Data¶

In [70]:

plot_color = "#64AAE8"

In [71]:

def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, **kwargs):
    fig, ax = plt.subplots()
    counts = series.value_counts().sort_index(1)
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        plt.xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    for i,x in enumerate(counts):
        plt.annotate('%i' % x, (i, x + label_offset))
        
    fig.tight_layout()

In [72]:

unique_students = demographic.drop_duplicates('study_id')

In [73]:

unique_students.shape

Out[73]:

(4893, 67)

In [74]:

unique_students.age.describe()

Out[74]:

count    4650.000000
mean       31.476559
std        27.658987
min         0.000000
25%        10.000000
50%        26.000000
75%        42.000000
max       221.000000
Name: age, dtype: float64

In [75]:

plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)

In [76]:

plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)

In [77]:

plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)

In [78]:

amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))

There are 4537 null values for age_amp

In [79]:

age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))

In [80]:

age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))

In [81]:

(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')

Out[81]:

<matplotlib.text.Text at 0x116eb8438>

In [82]:

plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)

In [83]:

plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))

In [84]:

degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)

In [85]:

plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, color=plot_color, ylim=(0,2000))

In [86]:

type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)

In [87]:

plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)

In [88]:

def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    return summary[['Sample Size','Mean','SD','Min','Max']]

In [89]:

receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary

Out[89]:

	Sample Size	Mean	SD	Min	Max
age_test
2	365	89.956164	19.590400	40	143
3	1088	91.164522	18.890610	40	139
4	770	88.429870	20.195620	20	137
5	532	81.971805	18.450145	20	120
6	270	81.548148	14.985603	40	110
7	194	81.747423	14.273898	49	130
8	153	74.856209	16.668761	40	120
9	98	68.153061	17.462981	34	93
10	80	73.562500	14.548778	41	100
11	63	64.698413	13.498668	43	107

In [90]:

receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)

Out[90]:

(-0.5, 9.5)

In [91]:

expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary

Out[91]:

	Sample Size	Mean	SD	Min	Max
age_test
2	381	90.889764	22.975237	50	141
3	1032	92.033915	22.110072	39	145
4	725	88.816552	23.213417	0	140
5	476	83.004202	19.098415	26	125
6	253	81.833992	17.057678	20	111
7	187	83.962567	15.558205	49	115
8	154	81.188312	14.109177	55	114
9	81	75.086420	18.305872	41	105
10	85	77.847059	14.929601	52	110
11	68	74.102941	10.905482	57	99

In [92]:

expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
plt.ylim(0, 1100)

Out[92]:

(0, 1100)

In [93]:

articulation_summary = score_summary("Articulation")
articulation_summary

Out[93]:

	Sample Size	Mean	SD	Min	Max
age_test
2	310	84.058065	16.079390	52	122
3	870	81.024138	18.826887	40	126
4	630	81.941270	19.455172	40	117
5	498	77.323293	21.931068	39	112
6	268	73.100746	24.007043	39	110
7	173	79.236994	20.594829	40	108
8	128	79.210938	20.123498	40	107
9	70	75.271429	20.355480	40	106
10	43	78.046512	21.542818	40	105
11	97	85.494845	15.293738	39	104

In [94]:

sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

Language scores

In [95]:

lsl_dr.domain.unique()

Out[95]:

array(['Expressive Vocabulary', 'Language', 'Articulation',
       'Receptive Vocabulary', nan], dtype=object)

In [96]:

lsl_dr.test_type.unique()

Out[96]:

array(['EOWPVT', 'receptive', 'expressive', 'Goldman', 'PPVT', nan,
       'Arizonia', 'EVT', 'ROWPVT', 'Arizonia and Goldman',
       'EOWPVT and EVT', 'PPVT and ROWPVT'], dtype=object)

In [97]:

receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary

Out[97]:

	Sample Size	Mean	SD	Min	Max
age_test
2	891	80.783389	22.020807	50	150
3	944	81.379237	20.532473	50	131
4	661	80.316188	20.072630	43	132
5	457	78.770241	18.782742	49	127
6	222	71.225225	17.404782	45	115
7	148	75.824324	16.579931	41	108
8	153	67.764706	17.823611	40	123
9	14	52.642857	17.367552	40	82
10	16	69.437500	9.244593	58	88
11	20	82.000000	27.868110	40	130

In [98]:

sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

In [99]:

expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary

Out[99]:

	Sample Size	Mean	SD	Min	Max
age_test
2	884	85.030543	17.776904	50	150
3	948	79.734177	17.122609	50	135
4	652	76.124233	19.010666	48	126
5	469	72.812367	19.812556	45	127
6	221	63.923077	15.833465	44	103
7	156	66.576923	19.150689	45	114
8	155	59.948387	20.547630	40	112
9	14	48.357143	13.964830	40	79
10	16	68.250000	16.699301	40	107
11	20	81.750000	29.889049	40	128

In [100]:

sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);

In [101]:

(unique_students.age/12.).hist(grid=False, bins=np.sqrt(unique_students.shape[0]))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')

Out[101]:

<matplotlib.text.Text at 0x114f86588>

In [102]:

(unique_students.age/12.).describe()

Out[102]:

count    4650.000000
mean        2.623047
std         2.304916
min         0.000000
25%         0.833333
50%         2.166667
75%         3.500000
max        18.416667
Name: age, dtype: float64

In [103]:

def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})

In [104]:

audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())

In [105]:

plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')

Out[105]:

<matplotlib.text.Text at 0x114170048>

In [106]:

slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())

In [107]:

plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')

Out[107]:

<matplotlib.text.Text at 0x114f84518>

In [108]:

sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())

In [109]:

plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')

Out[109]:

<matplotlib.text.Text at 0x11418bb00>

In [110]:

lsl_dr.degree_hl.dropna().value_counts()

Out[110]:

6    10446
3     3127
4     3041
5     2705
2     1416
0      549
1      170
dtype: int64

In [111]:

ax = lsl_dr.degree_hl.hist(bins=7)

In [112]:

diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)

Out[112]:

<matplotlib.axes._subplots.AxesSubplot at 0x113fad438>

In [113]:

(lsl_dr.age_int<6).mean()

Out[113]:

0.1993352852734035

In [114]:

(lsl_dr.age<6).mean()

Out[114]:

0.15173696288676111