In [1]:

# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:

DATA_DIR = '../data/clean/'

Connect to database to import data for the three test domains and demographic information:

In [3]:

from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)

In [4]:

metadata = lsl_dr_project.export_metadata()

Import each database from REDCap:

In [5]:

articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None,
                                                                                                'na_values':[999, 9999]})

In [6]:

records = lsl_dr_project.export_records(fields=articulation_fields)

In [7]:

print(records[0]['study_id'])

0101-2002-0101

In [8]:

expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})

In [9]:

receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})

In [10]:

language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf',
                   'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
                   'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
                   'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})

In [11]:

demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year_rv',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age', 'parent_hl', 'med_cause', 'known_synd', 'school_grade',
'att_days_hr', 'att_days_sch', 'att_days_st2_417', 'optionserv_type', 'option_pop', 'otherserv']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})

/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2802: DtypeWarning: Columns (27,33) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):

Attendance information¶

Several fields in the demographic data have missing values.

In [12]:

demographic_raw.head()

Out[12]:

	study_id	redcap_event_name	academic_year_rv	gender	race	prim_lang	sib	mother_ed	father_ed	...	sle_fo	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid
0	0101-2002-0101	initial_assessment_arm_1	2002.0	0.0	0.0	0.0	1.0	6.0	6.0	...	2.0	2.0	54.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN
1	0101-2002-0101	year_1_complete_71_arm_1	2003.0	NaN	NaN	NaN	NaN	NaN	NaN	...	4.0	4.0	80.0	1.0	NaN	NaN	NaN	NaN	NaN	NaN
2	0101-2002-0101	year_2_complete_71_arm_1	2004.0	NaN	NaN	NaN	NaN	NaN	NaN	...	4.0	4.0	80.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN
3	0101-2002-0101	year_3_complete_71_arm_1	2005.0	NaN	NaN	NaN	NaN	NaN	NaN	...	5.0	5.0	96.0	3.0	NaN	NaN	NaN	NaN	NaN	NaN
4	0101-2002-0101	year_4_complete_71_arm_1	2006.0	NaN	NaN	NaN	NaN	NaN	NaN	...	5.0	5.0	109.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 56 columns

We can fill missing values forward from previous observation (by study_id)

In [16]:

demographic = demographic_raw.sort_values(by='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort_values(by='redcap_event_name').study_id

Random check to make sure this worked

In [17]:

demographic[demographic.study_id=='1147-2010-0064']

Out[17]:

	redcap_event_name	academic_year_rv	sib	mother_ed	father_ed	parent_hl	...	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid	study_id
17762	initial_assessment_arm_1	2010.0	1.0	3.0	3.0	NaN	...	6.0	65.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
17763	year_1_complete_71_arm_1	2011.0	1.0	3.0	3.0	NaN	...	5.0	77.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
17764	year_2_complete_71_arm_1	2012.0	1.0	3.0	3.0	NaN	...	5.0	89.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064
17765	year_3_complete_71_arm_1	2013.0	1.0	3.0	3.0	NaN	...	5.0	101.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	1147-2010-0064

4 rows × 56 columns

Demographic data without missing values:

In [18]:

demographic.head()

Out[18]:

	redcap_event_name	academic_year_rv	gender	race	sib	mother_ed	father_ed	parent_hl	...	a_fo	fam_age	family_inv	att_days_sch	att_days_st2_417	att_days_hr	demo_ses	school_lunch	medicaid	study_id
0	initial_assessment_arm_1	2002.0	0.0	0.0	1.0	6.0	6.0	0.0	...	2.0	54.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	0101-2002-0101
9458	initial_assessment_arm_1	2009.0	1.0	2.0	1.0	3.0	3.0	0.0	...	5.0	135.0	1.0	NaN	NaN	NaN	NaN	NaN	NaN	0628-1999-0506
9463	initial_assessment_arm_1	2009.0	0.0	2.0	1.0	2.0	0.0	0.0	...	4.0	175.0	3.0	NaN	NaN	NaN	NaN	NaN	NaN	0628-2000-0308
9469	initial_assessment_arm_1	2009.0	0.0	6.0	1.0	3.0	5.0	NaN	...	6.0	100.0	0.0	NaN	NaN	NaN	NaN	NaN	NaN	0628-2001-0034
9473	initial_assessment_arm_1	2009.0	0.0	2.0	2.0	2.0	2.0	0.0	...	4.0	154.0	4.0	NaN	NaN	NaN	NaN	NaN	NaN	0628-2001-0289

5 rows × 56 columns

Cleaning languge dataset¶

5 language measures:

3 versions of CELF
PLS
- pls_ac_rs: PLS: Auditory Comprehension Raw Score
- pls_ac_ss: PLS: Auditory Comprehension Standard Score
- pls_ec_rs: PLS: Expressive Communication Raw Score
- pls_ec_ss: PLS: Expressive Communication Standard Score
- pls_tl_rs: PLS: Total Language Score Standard Score Total
- pls_tl_ss: PLS: Total Language Score Standard Score
OWLS
- age_test_owls: Age at time of testing (OWLS)
- owls_lc_rs: OWLS: Listening Comprehension Raw Score
- owls_lc_ss: OWLS: Listening Comprehension Standard Score
- owls_oe_rs: OWLS: Oral Expression Raw Score
- owls_oe_ss: OWLS: Oral Expression Standard Score
- owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
- owls_oc_ss: OWLS: Oral Composite Standard Score
- owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
- owls_wes_as: OWLS: Written Expression Scale Ability Score
- owls_wes_ss: OWLS: Written Expression Scale Standard Score
- owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
- owls_lcss: OWLS: Language Composite Standard Score

In [19]:

# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))

test_type  expressive  receptive
test_name                       
CELF-4            682        584
CELF-P2          1927       1933
OWLS             1370       1376
PLS              4603       4613
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [20]:

language["school"] = language.study_id.str.slice(0,4)

In [21]:

language_subtest = language[["study_id", "redcap_event_name", "score", "test_type", 
                             "test_name", "school", "age_test", 
                             'celfp_ss_ss', 'celfp_ws_ss', 
                             'celfp_ev_ss', 'celfp_fd_ss',
                             'celfp_rs_ss', 'celfp_bc_ss', 
                             'celfp_wcr_ss', 'celfp_wce_ss',
                             'celfp_wct_ss']]

In [22]:

language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()

Out[22]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain
0	0101-2002-0101	initial_assessment_arm_1	51	receptive	PLS	0101	54	Language
5	0101-2002-0101	year_5_complete_71_arm_1	61	receptive	OWLS	0101	113	Language
9	0101-2003-0102	initial_assessment_arm_1	55	receptive	PLS	0101	44	Language
10	0101-2003-0102	year_1_complete_71_arm_1	77	receptive	PLS	0101	54	Language
11	0101-2003-0102	year_2_complete_71_arm_1	93	receptive	CELF-P2	0101	68	Language

In [23]:

language.to_csv(DATA_DIR+'language.csv')

Cleaning articulation dataset¶

We converted the articulation dataset into a "long" format:

In [24]:

# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]

Goldman                 5863
Arizonia                 563
Arizonia and Goldman      94
Name: test_type, dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [25]:

articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [26]:

articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())

count    6518.000000
mean       67.943081
std        29.729030
min        23.000000
25%        47.000000
50%        60.000000
75%        80.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [27]:

# articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()

Out[27]:

	study_id	redcap_event_name	age_test_aaps	aaps_ss	age_test_gf2	gf2_ss	test_type	score	school	age_test	domain
1	0101-2002-0101	year_1_complete_71_arm_1	NaN	NaN	80.0	78.0	Goldman	78.0	0101	80.0	Articulation
9	0101-2003-0102	initial_assessment_arm_1	NaN	NaN	44.0	72.0	Goldman	72.0	0101	44.0	Articulation
10	0101-2003-0102	year_1_complete_71_arm_1	NaN	NaN	54.0	97.0	Goldman	97.0	0101	54.0	Articulation
14	0101-2004-0101	year_2_complete_71_arm_1	NaN	NaN	53.0	75.0	Goldman	75.0	0101	53.0	Articulation
15	0101-2004-0101	year_3_complete_71_arm_1	NaN	NaN	66.0	80.0	Goldman	80.0	0101	66.0	Articulation

In [28]:

articulation.to_csv(DATA_DIR+'articulation.csv')

Cleaning demographic dataset¶

We excluded unwanted columns and rows for which age, gender or race were missing:

In [29]:

# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [30]:

demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))

False    14619
True      3431
Name: non_english, dtype: int64
There are 501 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

0=no high school diploma
1=high school
2=undergraduate
3=graduate

Category 6 (unknown) was recoded as missing.

In [31]:

demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))

_mother_ed:
6.0    6008
4.0    4088
3.0    2762
5.0    2228
2.0    1977
1.0     605
0.0     273
Name: _mother_ed, dtype: int64
mother_ed:
1.0    4739
2.0    4088
3.0    2228
0.0     878
Name: mother_ed, dtype: int64

There are 6618 null values for mother_ed

In [32]:

parent_hl_lookup = {0: "Both parents do not have a hearing loss",
        1: "Both parents have hearing loss",
        2: "Mother has hearing loss",
        3: "Father has hearing loss",
        4: "Unknown"}

In [33]:

demographic['parent_hearing_loss'] = demographic.parent_hl.replace(parent_hl_lookup)

Secondary diagnosis

In [34]:

demographic.shape

Out[34]:

(18551, 59)

In [35]:

demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None

In [36]:

demographic.secondary_diagnosis.value_counts()

Out[36]:

0.0    13659
1.0     2960
Name: secondary_diagnosis, dtype: int64

In [37]:

demographic.secondary_diagnosis.mean()

Out[37]:

0.17810939286358987

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [38]:

demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))

There are 3497 null values for premature_weeks

In [39]:

demographic.premature_weeks.value_counts()

Out[39]:

0.0     12994
2.0       754
4.0       472
12.0      229
6.0       211
10.0      195
8.0       147
14.0       47
16.0        5
Name: premature_weeks, dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [40]:

demographic.tech_ad.value_counts()

Out[40]:

1.0     6460
0.0     5461
7.0     2040
5.0     1279
2.0      727
6.0      503
8.0      100
9.0       86
4.0       42
3.0       28
10.0      11
Name: tech_ad, dtype: int64

In [41]:

tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None

demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None

In [42]:

demographic.tech_left.value_counts()

Out[42]:

2.0    8656
3.0    5542
0.0    2412
4.0      68
1.0      21
Name: tech_left, dtype: int64

In [43]:

demographic.tech_right.value_counts()

Out[43]:

2.0    8519
3.0    6064
0.0    2040
4.0      86
1.0      28
Name: tech_right, dtype: int64

Substitute valid missing values for hearing loss:

In [44]:

demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [45]:

demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in maximum
  """Entry point for launching an IPython kernel.

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

0=none
1=one ear
2=both ears.

In [46]:

demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))

oad:
0    5842
1       4
2       2
Name: oad, dtype: int64
There are 1764 null values for OAD

hearing_aid:
2    2771
0    2022
1    1024
Name: hearing_aid, dtype: int64
There are 1814 null values for hearing_aid

cochlear:
0    3993
2    1135
1     720
Name: cochlear, dtype: int64
There are 1764 null values for cochlear
18551

Identify bilateral and bimodal individuals:

In [47]:

demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["unilateral_ha"] = demographic.hearing_aid==1
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)

In [48]:

demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()

Out[48]:

(4561, 7014, 1711, 2484)

In [49]:

demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()

Out[49]:

unilateral_ci     720
bilateral_ci     1135
bilateral_ha     2771
bimodal           440
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [50]:

demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))

There are 0 null values for tech

In [51]:

demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()

Out[51]:

6    7014
3    4561
4    1711
1    1411
0     753
8      16
2      11
7       5
5       1
Name: implant_category, dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [52]:

# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [53]:

demographic.age_diag.isnull().sum()

Out[53]:

In [54]:

demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})

Child has another diagnosed disability

In [55]:

demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None

In [56]:

# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [57]:

demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [58]:

races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column

_race:
0.0    9520
2.0    3376
1.0    1711
3.0    1371
6.0     980
8.0     640
7.0     316
4.0      76
5.0      52
Name: _race, dtype: int64
race:
0.0    9520
2.0    3376
4.0    1748
1.0    1711
3.0    1371
Name: race, dtype: int64
There are 825 null values for race

Recode implant technology variables

In [59]:

tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)

In [60]:

demographic.to_csv(DATA_DIR+'demographics.csv')

Cleaning expressive vocabulary dataset¶

We converted the expressive vocabulary dataset to "long" format:

In [61]:

# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]

There are 0 null values for test_type

In [62]:

expressive.test_type.value_counts()

Out[62]:

EVT               4666
EOWPVT            3290
EOWPVT and EVT     205
Name: test_type, dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [63]:

expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [64]:

expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [65]:

# expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()

Out[65]:

	study_id	redcap_event_name	age_test_eowpvt	eowpvt_ss	age_test_evt	evt_ss	test_type	score	school	age_test	domain
0	0101-2002-0101	initial_assessment_arm_1	54.0	58.0	NaN	NaN	EOWPVT	58.0	0101	54.0	Expressive Vocabulary
2	0101-2002-0101	year_2_complete_71_arm_1	80.0	84.0	NaN	NaN	EOWPVT	84.0	0101	80.0	Expressive Vocabulary
5	0101-2002-0101	year_5_complete_71_arm_1	113.0	90.0	NaN	NaN	EOWPVT	90.0	0101	113.0	Expressive Vocabulary
14	0101-2004-0101	year_2_complete_71_arm_1	53.0	90.0	NaN	NaN	EOWPVT	90.0	0101	53.0	Expressive Vocabulary
15	0101-2004-0101	year_3_complete_71_arm_1	66.0	87.0	NaN	NaN	EOWPVT	87.0	0101	66.0	Expressive Vocabulary

In [66]:

expressive.to_csv(DATA_DIR+'expressive_vocabulary.csv')

Cleaning receptive vocabulary dataset¶

We converted the receptive vocabulary data table to "long" format:

In [67]:

# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]

There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [68]:

receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [69]:

receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]

In [70]:

print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))

There are 25 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [71]:

# receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()

Out[71]:

	study_id	redcap_event_name	age_test_ppvt	ppvt_ss	age_test_rowpvt	rowpvt_ss	test_type	score	school	age_test	domain
2	0101-2002-0101	year_2_complete_71_arm_1	80.0	90.0	NaN	NaN	PPVT	90.0	0101	80.0	Receptive Vocabulary
5	0101-2002-0101	year_5_complete_71_arm_1	NaN	NaN	113.0	101.0	ROWPVT	101.0	0101	113.0	Receptive Vocabulary
9	0101-2003-0102	initial_assessment_arm_1	44.0	55.0	NaN	NaN	PPVT	55.0	0101	44.0	Receptive Vocabulary
10	0101-2003-0102	year_1_complete_71_arm_1	54.0	80.0	NaN	NaN	PPVT	80.0	0101	54.0	Receptive Vocabulary
11	0101-2003-0102	year_2_complete_71_arm_1	68.0	101.0	NaN	NaN	PPVT	101.0	0101	68.0	Receptive Vocabulary

In [72]:

receptive.study_id.unique().shape

Out[72]:

(3630,)

In [73]:

receptive.to_csv(DATA_DIR+'receptive_vocabulary.csv')

Merge datasets¶

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [74]:

test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [75]:

lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')

In [76]:

lsl_dr.tail()

Out[76]:

	redcap_event_name	academic_year_rv	male	sib	_mother_ed	father_ed	...	domain	eowpvt_ss	evt_ss	gf2_ss	ppvt_ss	rowpvt_ss	school	score	test_name	test_type
47072	year_9_complete_71_arm_1	2015.0	0.0	1.0	6.0	6.0	...	Receptive Vocabulary	NaN	NaN	NaN	97.0	NaN	0102	97	NaN	PPVT
47073	year_9_complete_71_arm_1	2012.0	0.0	3.0	4.0	4.0	...	Articulation	NaN	NaN	NaN	NaN	NaN	0735	75	NaN	Arizonia
47074	year_9_complete_71_arm_1	2012.0	0.0	3.0	4.0	4.0	...	Expressive Vocabulary	NaN	79.0	NaN	NaN	NaN	0735	79	NaN	EVT
47075	year_9_complete_71_arm_1	2012.0	0.0	3.0	4.0	4.0	...	Receptive Vocabulary	NaN	NaN	NaN	73.0	NaN	0735	73	NaN	PPVT
47076	year_9_complete_71_arm_1	2012.0	1.0	3.0	4.0	4.0	...	Articulation	NaN	NaN	100.0	NaN	NaN	0521	100	NaN	Goldman

5 rows × 97 columns

Convert score to floating-point number

In [77]:

lsl_dr.score = lsl_dr.score.astype(float)

Export dataset

In [78]:

lsl_dr.to_csv(DATA_DIR+'lsl_dr.csv')

In [79]:

lsl_dr.shape

Out[79]:

(47077, 97)