# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
DATA_DIR = '../data/clean/'
Connect to the database to import data for the three test domains and demographic information:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()
lsl_dr_project = Project(api_url, api_key)
metadata = lsl_dr_project.export_metadata()
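The exported metadata is the project's data dictionary, which PyCap returns as a list of dicts by default; a quick peek (illustrative only, not part of the pipeline — meta_df is our name) confirms the connection works. The column names are standard REDCap data-dictionary fields:
# Inspect the data dictionary to confirm the connection (illustrative)
meta_df = pd.DataFrame(metadata)
print(meta_df[['field_name', 'form_name', 'field_type']].head())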
Import each dataset from REDCap:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None,
'na_values':[999, 9999]})
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2002-0101
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf',
'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year_rv',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age', 'parent_hl', 'med_cause', 'known_synd', 'school_grade',
'att_days_hr', 'att_days_sch', 'att_days_st2_417', 'optionserv_type', 'option_pop', 'otherserv']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[888, 999, 9999]})
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2802: DtypeWarning: Columns (27,33) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):
Several fields in the demographic data have missing values.
demographic_raw.head()
| | study_id | redcap_event_name | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0101-2002-0101 | initial_assessment_arm_1 | 2002.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | 2.0 | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 0101-2002-0101 | year_1_complete_71_arm_1 | 2003.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 2004.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 0101-2002-0101 | year_3_complete_71_arm_1 | 2005.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 96.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 0101-2002-0101 | year_4_complete_71_arm_1 | 2006.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 109.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 56 columns
We can fill missing values forward from the previous observation, within each study_id:
demographic = demographic_raw.sort_values(by='redcap_event_name').groupby('study_id').transform(
lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort_values(by='redcap_event_name').study_id
Spot-check a random study_id to make sure this worked:
demographic[demographic.study_id=='1147-2010-0064']
| | redcap_event_name | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | father_ed | parent_hl | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 17762 | initial_assessment_arm_1 | 2010.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | NaN | ... | 6.0 | 65.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
| 17763 | year_1_complete_71_arm_1 | 2011.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | NaN | ... | 5.0 | 77.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
| 17764 | year_2_complete_71_arm_1 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | NaN | ... | 5.0 | 89.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
| 17765 | year_3_complete_71_arm_1 | 2013.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | NaN | ... | 5.0 | 101.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
4 rows × 56 columns
Demographic data after forward-filling (fields never recorded for a child remain missing):
demographic.head()
| | redcap_event_name | academic_year_rv | hl | gender | race | prim_lang | sib | mother_ed | father_ed | parent_hl | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | initial_assessment_arm_1 | 2002.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | 6.0 | 0.0 | ... | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2002-0101 |
| 9458 | initial_assessment_arm_1 | 2009.0 | 0.0 | 1.0 | 2.0 | 0.0 | 1.0 | 3.0 | 3.0 | 0.0 | ... | 5.0 | 135.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-1999-0506 |
| 9463 | initial_assessment_arm_1 | 2009.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 2.0 | 0.0 | 0.0 | ... | 4.0 | 175.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2000-0308 |
| 9469 | initial_assessment_arm_1 | 2009.0 | 0.0 | 0.0 | 6.0 | 0.0 | 1.0 | 3.0 | 5.0 | NaN | ... | 6.0 | 100.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2001-0034 |
| 9473 | initial_assessment_arm_1 | 2009.0 | 0.0 | 0.0 | 2.0 | 0.0 | 2.0 | 2.0 | 2.0 | 0.0 | ... | 4.0 | 154.0 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2001-0289 |
5 rows × 56 columns
Five language measures:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()
language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss
language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
language2.loc[CELP, "score"] = language2.celfp_el_ss
language2.loc[CELF, "score"] = language2.celf_elss
language2.loc[PLS, "score"] = language2.pls_ec_ss
language2.loc[OWLS, "score"] = language2.owls_oe_ss
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name
CELF-4            682        584
CELF-P2          1927       1933
OWLS             1370       1376
PLS              4603       4613
There are 0 null values for score
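As a consistency check (illustrative; expected is our name), the long table should contain exactly one row per non-missing receptive score plus one per non-missing expressive score:
expected = language1.score.notnull().sum() + language2.score.notnull().sum()
print(expected == len(language))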
A school variable was added, which is the first four characters of the study_id:
language["school"] = language.study_id.str.slice(0,4)
language_subtest = language[["study_id", "redcap_event_name", "score", "test_type",
"test_name", "school", "age_test",
'celfp_ss_ss', 'celfp_ws_ss',
'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss',
'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']]
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
| | study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain |
|---|---|---|---|---|---|---|---|---|
| 0 | 0101-2002-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
| 5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
| 9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
| 10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
| 11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
language.to_csv(DATA_DIR+'language.csv')
We converted the articulation dataset into a "long" format:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 5863
Arizonia                 563
Arizonia and Goldman      94
Name: test_type, dtype: int64
There are 0 null values for test_type
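The score assignment above is a "prefer AAPS, fall back to Goldman" rule; pandas' combine_first expresses the same logic in one line, which makes a handy cross-check (illustrative; score_alt is our name):
score_alt = articulation.aaps_ss.combine_first(articulation.gf2_ss)
print((score_alt == articulation.score).all())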
A school variable was added, which is the first four characters of the study_id:
articulation["school"] = articulation.study_id.str.slice(0,4)
The age was taken to be the Arizonia (AAPS) age when both test types were present:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    6518.000000
mean       67.943081
std        29.729030
min        23.000000
25%        47.000000
50%        60.000000
75%        80.000000
max       243.000000
Name: age_test, dtype: float64
Finally, we added a domain identification column for merging; dropping the unwanted test columns is left commented out:
# articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
| | study_id | redcap_event_name | age_test_aaps | aaps_ss | age_test_gf2 | gf2_ss | test_type | score | school | age_test | domain |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0101-2002-0101 | year_1_complete_71_arm_1 | NaN | NaN | 80.0 | 78.0 | Goldman | 78.0 | 0101 | 80.0 | Articulation |
| 9 | 0101-2003-0102 | initial_assessment_arm_1 | NaN | NaN | 44.0 | 72.0 | Goldman | 72.0 | 0101 | 44.0 | Articulation |
| 10 | 0101-2003-0102 | year_1_complete_71_arm_1 | NaN | NaN | 54.0 | 97.0 | Goldman | 97.0 | 0101 | 54.0 | Articulation |
| 14 | 0101-2004-0101 | year_2_complete_71_arm_1 | NaN | NaN | 53.0 | 75.0 | Goldman | 75.0 | 0101 | 53.0 | Articulation |
| 15 | 0101-2004-0101 | year_3_complete_71_arm_1 | NaN | NaN | 66.0 | 80.0 | Goldman | 80.0 | 0101 | 66.0 | Articulation |
articulation.to_csv(DATA_DIR+'articulation.csv')
We renamed the gender column to male (0 = female, 1 = male); the exclusion of rows with missing demographics is left commented out:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the primary language variable to a binary non_english indicator (False = English, True = non-English):
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False    14619
True      3431
Name: non_english, dtype: int64
There are 501 null values for non_english
Mother's education (mother_ed) and father's education (father_ed) were both recoded to a coarser four-level scale. Category 6 (unknown) was recoded as missing.
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic._mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed:
6.0    6008
4.0    4088
3.0    2762
5.0    2228
2.0    1977
1.0     605
0.0     273
Name: _mother_ed, dtype: int64
mother_ed:
1.0    4739
2.0    4088
3.0    2228
0.0     878
Name: mother_ed, dtype: int64

There are 6618 null values for mother_ed
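The chained .loc assignments amount to a single value mapping; assuming _mother_ed only takes values 0–6 or missing, the recode can be cross-checked with Series.map (illustrative; mother_ed_map is our name, and this should print True):
mother_ed_map = {0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 3, 6: np.nan}
print(demographic._mother_ed.map(mother_ed_map).equals(demographic.mother_ed))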
parent_hl_lookup = {0: "Both parents do not have a hearing loss",
1: "Both parents have hearing loss",
2: "Mother has hearing loss",
3: "Father has hearing loss",
4: "Unknown"}
demographic['parent_hearing_loss'] = demographic.parent_hl.replace(parent_hl_lookup)
Secondary diagnosis
demographic.shape
(18551, 59)
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0.0    13659
1.0     2960
Name: secondary_diagnosis, dtype: int64
demographic.secondary_diagnosis.mean()
0.17810939286358987
Prematurity was recoded from the categorical premature_age into premature_weeks, the approximate number of weeks born before full term (0 = full term); category 9 (unknown) was treated as missing.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3497 null values for premature_weeks
demographic.premature_weeks.value_counts()
0.0     12994
2.0       754
4.0       472
12.0      229
6.0       211
10.0      195
8.0       147
14.0       47
16.0        5
Name: premature_weeks, dtype: int64
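The arithmetic behind the recode, made explicit (illustrative): category 8 is full term, and each category step below 8 corresponds to two additional weeks premature:
for cat in range(9):
    print(cat, '->', abs(cat - 8) * 2, 'weeks premature')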
Recode implant technology variables for each ear to one of five categories (None, OAD, Hearing aid, Cochlear implant, Other):
demographic.tech_ad.value_counts()
1.0     6460
0.0     5461
7.0     2040
5.0     1279
2.0      727
6.0      503
8.0      100
9.0       86
4.0       42
3.0       28
10.0      11
Name: tech_ad, dtype: int64
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None
demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
demographic.tech_left.value_counts()
2.0    8656
3.0    5542
0.0    2412
4.0      68
1.0      21
Name: tech_left, dtype: int64
demographic.tech_right.value_counts()
2.0    8519
3.0    6064
0.0    2040
4.0      86
1.0      28
Name: tech_right, dtype: int64
Recode hearing-loss type values of 5 as missing:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_as'] = None
Create degree_hl, which is the maximum level of hearing loss in either ear:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in maximum
  """Entry point for launching an IPython kernel.
Create a compound indicator variable for each technology (OAD, hearing aid, cochlear implant):
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad:
0    5842
1       4
2       2
Name: oad, dtype: int64
There are 1764 null values for OAD

hearing_aid:
2    2771
0    2022
1    1024
Name: hearing_aid, dtype: int64
There are 1814 null values for hearing_aid

cochlear:
0    3993
2    1135
1     720
Name: cochlear, dtype: int64
There are 1764 null values for cochlear
18551
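The three blocks above repeat one pattern, so a small helper makes the intent explicit (a sketch, not the notebook's original code; ear_count is our name):
def ear_count(right, left, code):
    """0 = neither ear, 1 = one ear, 2 = both ears; missing if both sides are missing."""
    out = pd.Series(0, index=right.index, dtype=object)
    out[(right == code) | (left == code)] = 1
    out[(right == code) & (left == code)] = 2
    out[right.isnull() & left.isnull()] = None
    return out

# e.g. demographic['cochlear'] = ear_count(demographic.tech_right, demographic.tech_left, 3)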
Identify bilateral and bimodal individuals:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["unilateral_ha"] = demographic.hearing_aid==1
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
(4561, 7014, 1711, 2484)
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci',
'bilateral_ha',
'bimodal']].sum()
unilateral_ci     720
bilateral_ci     1135
bilateral_ha     2771
bimodal           440
dtype: int64
Create a tech variable that identifies non-bimodal (0), bimodal with the hearing aid on the left (1), and bimodal with the hearing aid on the right (2):
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0),
'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0),
'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1),
'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0),
'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1),
'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2),
'implant_category'] = 8
demographic.implant_category.value_counts()
6    7014
3    4561
4    1711
1    1411
0     753
8      16
2      11
7       5
5       1
Name: implant_category, dtype: int64
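The nine mutually exclusive rules above can be written more compactly with np.select, which also makes the default for unmatched combinations explicit (illustrative; implant_category_alt is our name):
c, h, o = demographic.cochlear, demographic.hearing_aid, demographic.oad
conditions = [(c == 1) & (h == 0) & (o == 0),
              (c == 0) & (h == 1) & (o == 0),
              (c == 0) & (h == 0) & (o == 1),
              (c == 2) & (h == 0) & (o == 0),
              (c == 1) & (h == 1) & (o == 0),
              (c == 1) & (h == 0) & (o == 1),
              (c == 0) & (h == 2) & (o == 0),
              (c == 0) & (h == 1) & (o == 1),
              (c == 0) & (h == 0) & (o == 2)]
# Categories 0-8 in order; anything else stays missing
implant_category_alt = np.select(conditions, list(range(9)), default=None)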
Age when hearing loss was diagnosed. These data were once entered inconsistently, requiring non-numeric values to be replaced; the field is now clean enough to use directly.
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1
Number of null values for age_diag:
3928
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
Child has another diagnosed disability
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts were properly encoded as None (missing).
We reduced the number of race categories, pooling those that were neither Caucasian, Black, Hispanic, nor Asian into "Other", due to small sample sizes in those categories. Category 7 (unknown) was recoded as missing.
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race:
0.0    9520
2.0    3376
1.0    1711
3.0    1371
6.0     980
8.0     640
7.0     316
4.0      76
5.0      52
Name: _race, dtype: int64
race:
0.0    9520
2.0    3376
4.0    1748
1.0    1711
3.0    1371
Name: race, dtype: int64
There are 825 null values for race
Recode implant technology variables (this revised mapping overwrites the tech_right and tech_left columns created above):
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
demographic.to_csv(DATA_DIR+'demographics.csv')
We converted the expressive vocabulary dataset to "long" format:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT               4666
EOWPVT            3290
EOWPVT and EVT     205
Name: test_type, dtype: int64
A school variable was added, which is the first four characters of the study_id:
expressive["school"] = expressive.study_id.str.slice(0,4)
The age was taken to be the EOWPVT age when both test types were present:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
Finally, we added a domain identification column for merging; dropping the unwanted test columns is left commented out:
# expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
| | study_id | redcap_event_name | age_test_eowpvt | eowpvt_ss | age_test_evt | evt_ss | test_type | score | school | age_test | domain |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0101-2002-0101 | initial_assessment_arm_1 | 54.0 | 58.0 | NaN | NaN | EOWPVT | 58.0 | 0101 | 54.0 | Expressive Vocabulary |
| 2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 80.0 | 84.0 | NaN | NaN | EOWPVT | 84.0 | 0101 | 80.0 | Expressive Vocabulary |
| 5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 113.0 | 90.0 | NaN | NaN | EOWPVT | 90.0 | 0101 | 113.0 | Expressive Vocabulary |
| 14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 53.0 | 90.0 | NaN | NaN | EOWPVT | 90.0 | 0101 | 53.0 | Expressive Vocabulary |
| 15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 66.0 | 87.0 | NaN | NaN | EOWPVT | 87.0 | 0101 | 66.0 | Expressive Vocabulary |
expressive.to_csv(DATA_DIR+'expressive_vocabulary.csv')
We converted the receptive vocabulary data table to "long" format:
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
A school variable was added, which is the first four characters of the study_id:
receptive["school"] = receptive.study_id.str.slice(0,4)
The age was taken to be the PPVT age when both test types were present:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 25 null values for age_test
Finally, we added a domain identification column for merging; dropping the unwanted test columns is left commented out:
# receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
| | study_id | redcap_event_name | age_test_ppvt | ppvt_ss | age_test_rowpvt | rowpvt_ss | test_type | score | school | age_test | domain |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 80.0 | 90.0 | NaN | NaN | PPVT | 90.0 | 0101 | 80.0 | Receptive Vocabulary |
| 5 | 0101-2002-0101 | year_5_complete_71_arm_1 | NaN | NaN | 113.0 | 101.0 | ROWPVT | 101.0 | 0101 | 113.0 | Receptive Vocabulary |
| 9 | 0101-2003-0102 | initial_assessment_arm_1 | 44.0 | 55.0 | NaN | NaN | PPVT | 55.0 | 0101 | 44.0 | Receptive Vocabulary |
| 10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 54.0 | 80.0 | NaN | NaN | PPVT | 80.0 | 0101 | 54.0 | Receptive Vocabulary |
| 11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 68.0 | 101.0 | NaN | NaN | PPVT | 101.0 | 0101 | 68.0 | Receptive Vocabulary |
receptive.study_id.unique().shape
(3630,)
receptive.to_csv(DATA_DIR+'receptive_vocabulary.csv')
The four datasets were merged into a single table. First, we concatenated the test-score data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we merged the demographic data with the test-score data:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
| | redcap_event_name | academic_year_rv | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | parent_hl | ... | domain | eowpvt_ss | evt_ss | gf2_ss | ppvt_ss | rowpvt_ss | school | score | test_name | test_type |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 47072 | year_9_complete_71_arm_1 | 2015.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | 6.0 | 0.0 | ... | Receptive Vocabulary | NaN | NaN | NaN | 97.0 | NaN | 0102 | 97 | NaN | PPVT |
| 47073 | year_9_complete_71_arm_1 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 4.0 | 4.0 | 0.0 | ... | Articulation | NaN | NaN | NaN | NaN | NaN | 0735 | 75 | NaN | Arizonia |
| 47074 | year_9_complete_71_arm_1 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 4.0 | 4.0 | 0.0 | ... | Expressive Vocabulary | NaN | 79.0 | NaN | NaN | NaN | 0735 | 79 | NaN | EVT |
| 47075 | year_9_complete_71_arm_1 | 2012.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 4.0 | 4.0 | 0.0 | ... | Receptive Vocabulary | NaN | NaN | NaN | 73.0 | NaN | 0735 | 73 | NaN | PPVT |
| 47076 | year_9_complete_71_arm_1 | 2012.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 4.0 | 4.0 | 0.0 | ... | Articulation | NaN | NaN | 100.0 | NaN | NaN | 0521 | 100 | NaN | Goldman |
5 rows × 97 columns
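A quick way to validate the left join (illustrative; check is our name) is pandas' merge indicator, which shows how many demographic observations found no matching test score (left_only):
check = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"],
                 how='left', indicator=True)
print(check._merge.value_counts())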
Convert score to a floating-point number:
lsl_dr.score = lsl_dr.score.astype(float)
Export dataset
lsl_dr.to_csv(DATA_DIR+'lsl_dr.csv')
lsl_dr.shape
(47077, 97)