# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Connect to database to import data for the three test domains and demographic information:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()
lsl_dr_project = Project(api_url, api_key)
metadata = lsl_dr_project.export_metadata()
# for i,j in zip(lsl_dr_project.field_names,
# lsl_dr_project.field_labels):
# print('{0}: \t{1}'.format(i,j))
Import each database from REDCap:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2003-0101
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', df_kwargs={'index_col':None,
'na_values':[999, 9999]})
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', df_kwargs={'index_col':None,
'na_values':[999, 9999]})
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', df_kwargs={'index_col':None,
'na_values':[999, 9999]})
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
Several fields in the demographic data have missing values.
demographic_raw.head()
study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 2002-2003 | 0 | 0 | 0 | 0 | 1 | 6 | 6 | ... | 2 | 2 | 54 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | 2003-2004 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 1 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 2004-2005 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4 | 4 | 80 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 0101-2003-0101 | year_3_complete_71_arm_1 | 2005-2006 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 96 | 3 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 0101-2003-0101 | year_4_complete_71_arm_1 | 2006-2007 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5 | 5 | 109 | 2 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 46 columns
We can fill missing values forward from previous observation (by study_id
)
demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
lambda recs: recs.fillna(method='ffill')).reset_index()
demographic["study_id"] = demographic_raw.study_id
Demographic data without missing values:
demographic.head()
index | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 699 | initial_assessment_arm_1 | NaN | 0 | 0 | 0 | 6 | 3 | 5 | 4 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2003-0101 |
1 | 9715 | initial_assessment_arm_1 | 2009-2010 | 0 | 1 | 8 | 0 | 0 | 6 | 6 | ... | 3 | 50 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2003-0101 |
2 | 9719 | initial_assessment_arm_1 | 2009-2010 | 0 | 1 | 6 | 0 | 1 | 6 | 6 | ... | 3 | 32 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2003-0101 |
3 | 9723 | initial_assessment_arm_1 | 2009-2010 | 0 | 0 | 6 | 0 | 1 | 4 | 4 | ... | 4 | 25 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2003-0101 |
4 | 946 | initial_assessment_arm_1 | 2011-2012 | 0 | 1 | 1 | 0 | 0 | 6 | 6 | ... | 0 | 30 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2003-0101 |
5 rows × 47 columns
5 language measures:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()
language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss
language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type expressive receptive test_name CELF-4 539 489 CELF-P2 1168 1174 OWLS 867 873 PLS 2884 2893 There are 0 null values for score
A school
variable was added, which is the first four columns of the study_id
:
language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
We converted the articulation dataset into a "long" format:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 4250 Arizonia 485 Arizonia and Goldman 49 dtype: int64 There are 0 null values for test_type
A school
variable was added, which is the first four columns of the study_id
:
articulation["school"] = articulation.study_id.str.slice(0,4)
The age was taken to be the Arizonia age if there are both test types:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count 4781.000000 mean 69.179042 std 31.232998 min 23.000000 25% 47.000000 50% 60.000000 75% 81.000000 max 243.000000 Name: age_test, dtype: float64
Finally, we dropped unwanted columns and added a domain identification column for merging:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
study_id | redcap_event_name | test_type | score | school | age_test | domain | |
---|---|---|---|---|---|---|---|
1 | 0101-2003-0101 | year_1_complete_71_arm_1 | Goldman | 78 | 0101 | 80 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72 | 0101 | 44 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97 | 0101 | 54 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75 | 0101 | 53 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80 | 0101 | 66 | Articulation |
We excluded unwanted columns and rows for which age, gender or race were missing:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False 9667 True 2087 dtype: int64 There are 714 null values for non_english
Mother's education (mother_ed
) and father's education (father_ed
) were both recoded to:
Category 6 (unknown) was recoded as missing.
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed: 6 4298 4 2483 3 1682 5 1280 2 1113 1 421 0 156 dtype: int64 mother_ed: 1 2795 2 2483 3 1280 0 577 dtype: int64 There are 5333 null values for mother_ed
Secondary diagnosis
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0 9122 1 2120 dtype: int64
demographic.secondary_diagnosis.mean()
0.18857854474292832
Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3179 null values for premature_weeks
demographic.premature_weeks.value_counts()
0 7871 2 486 4 324 12 180 6 159 10 125 8 104 14 38 16 2 dtype: int64
Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.tech_left = np.abs(demographic.tech_left - 3)
Substitute valid missing values for hearing loss:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None
Create degree_hl
, which is the maximum level of hearing loss in either ear:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):
demographic["baha"] = 0
demographic.baha = demographic.baha.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'baha'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'baha'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'baha'] = None
print("baha:")
print(demographic.drop_duplicates(subset='study_id').baha.value_counts())
print("There are {0} null values for baha".format(sum(demographic.baha.isnull())))
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
baha: 0 4121 1 136 2 67 dtype: int64 There are 1476 null values for baha hearing_aid: 0 1902 2 1723 1 679 dtype: int64 There are 1516 null values for hearing_aid cochlear: 0 2430 2 1112 1 782 dtype: int64 There are 1476 null values for cochlear 12468
Identify bilateral and bimodal individuals:
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum()
(2934, 4333, 1220)
Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==0),
'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==0),
'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==1),
'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.baha==0),
'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.baha==0),
'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.baha==1),
'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.baha==0),
'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.baha==1),
'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.baha==2),
'implant_category'] = 8
demographic.implant_category.value_counts()
6 4333 3 2934 4 1220 0 680 1 470 2 293 8 168 7 19 5 8 dtype: int64
Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.
demographic.onset_1.unique()
array([ nan, 4. , 14. , 24. , 19. , 0. , 41. , 1. , 17. , 5. , 13. , 30. , 43. , 16. , 32. , 8. , 9. , 25. , 2. , 3. , 33. , 34. , 36. , 6. , 40. , 31. , 22. , 39. , 7. , 12. , 42. , 15. , 18. , 57. , 26. , 28. , 52. , 11. , 59. , 29. , 23. , 10. , 21. , 38. , 20. , 46. , 37. , 47. , 60. , 48. , 1.5, 56. , 44. , 35. , 27. , 86. , 64. , 58. , 70. , 49. , 0.5, 51. , 55. , 50. , 119. , 72. , 88. , 65. , 66. , 54. , 116. , 78. , 83. , 61. , 107. , 74. , 77. , 62. , 53. , 63. , 84. , 140. , 80. , 126. , 85. , 133. , 81. , 103. , 87. , 76. , 45. , 68. , 92. , 67. , 2.5, 97. , 71. , 75. , 98. , 152. , 89. , 154. ])
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1
Number of null values for age_diag
demographic.age_diag.isnull().sum()
3847
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
import seaborn as sb
unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()
ag = sb.factorplot("sex", data=unique_students,
palette="PuBuGn_d", kind='count')
ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()),
'Male ({})'.format((unique_students.male==1).sum())])
ag.set_xlabels('')
<seaborn.axisgrid.FacetGrid at 0x11c654d68>
Child has another diagnosed disability
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts were properly encoded as None
(missing).
demographic.loc[demographic.sib==4, 'sib'] = None
We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race: 0 6519 2 2000 1 1155 3 861 6 587 8 463 7 223 4 58 5 25 dtype: int64 race: 0 6519 2 2000 1 1155 4 1133 3 861 dtype: int64 There are 800 null values for race
Recode implant technology variables
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan,
# 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
# 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
# '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
demographic['academic_year'] = demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'0000-0000': np.nan})
demographic['academic_year_start'] = demographic.academic_year.apply(lambda x: str(x).strip()[:4]).value_counts()
demographic.age_amp.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x117c634e0>
We converted the expressive vocabulary dataset to "long" format:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT 3109 EOWPVT 2300 EOWPVT and EVT 120 dtype: int64
A school
variable was added, which is the first four columns of the study_id
:
expressive["school"] = expressive.study_id.str.slice(0,4)
The age was taken to be the EOWPVT age if there are both test types:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
Finally, we dropped unwanted columns and added a domain identification column for merging:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
0 | 0101-2003-0101 | initial_assessment_arm_1 | 58 | EOWPVT | 0101 | 54 | Expressive Vocabulary |
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 84 | EOWPVT | 0101 | 80 | Expressive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 90 | EOWPVT | 0101 | 113 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90 | EOWPVT | 0101 | 53 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87 | EOWPVT | 0101 | 66 | Expressive Vocabulary |
We converted the receptive vocabulary data table to "long" format:
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
A school
variable was added, which is the first four columns of the study_id
:
receptive["school"] = receptive.study_id.str.slice(0,4)
The age was taken to be the PPVT age if there are both test types:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 28 null values for age_test
Finally, we dropped unwanted columns and added a domain identification column for merging:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
2 | 0101-2003-0101 | year_2_complete_71_arm_1 | 90 | PPVT | 0101 | 80 | Receptive Vocabulary |
5 | 0101-2003-0101 | year_5_complete_71_arm_1 | 101 | ROWPVT | 0101 | 113 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | PPVT | 0101 | 44 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 80 | PPVT | 0101 | 54 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 101 | PPVT | 0101 | 68 | Receptive Vocabulary |
The four datasets were mereged into a single table. First, we concatenate the test scores data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we perform a merge between the demographic data and the test scores data:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
index | redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | ... | known_synd | synd_or_disab | race | academic_year_start | age_test | domain | school | score | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
25269 | 10878 | year_9_complete_71_arm_1 | 2008-2009 | 0 | 1 | 0 | 0 | 1 | 3 | 2 | ... | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25270 | 1449 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 0 | 0 | 1 | NaN | NaN | ... | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25271 | 1440 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 2 | 0 | NaN | 6 | 6 | ... | 0 | 0 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25272 | 244 | year_9_complete_71_arm_1 | 2013-2014 | 0 | 1 | 0 | 6 | 2 | 6 | 6 | ... | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25273 | 4822 | year_9_complete_71_arm_1 | 2012-2013 | 0 | 1 | 0 | 0 | 3 | 4 | 4 | ... | 0 | 1 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 73 columns
lsl_dr.score.hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x118085ba8>
Export dataset
lsl_dr.to_csv('lsl_dr.csv')
lsl_dr.shape
(25274, 73)
lsl_dr.study_id.unique().shape
(4893,)
demographic.study_id.unique().shape
(4893,)
Convert score to floating-point number
lsl_dr.score = lsl_dr.score.astype(float)
plot_color = "#64AAE8"
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None,
ylim=None, **kwargs):
fig, ax = plt.subplots()
counts = series.value_counts().sort_index(1)
counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
if xlim is None:
plt.xlim(-0.5, len(counts)-0.5)
if ylim is not None:
plt.ylim(*ylim)
plt.ylabel('Count')
if labels is not None:
ax.set_xticklabels(labels)
for i,x in enumerate(counts):
plt.annotate('%i' % x, (i, x + label_offset))
fig.tight_layout()
unique_students = demographic.drop_duplicates('study_id')
unique_students.shape
(4893, 67)
unique_students.age.describe()
count 4650.000000 mean 31.476559 std 27.658987 min 0.000000 25% 10.000000 50% 26.000000 75% 42.000000 max 221.000000 Name: age, dtype: float64
plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, ylim=(0, 2600), color=plot_color)
plot_demo_data(unique_students.prim_lang,
('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'),
rot=70, color=plot_color)
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'),
color=plot_color)
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months",
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years",
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]
demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4537 null values for age_amp
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
labels=amp_ages))
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
plt.annotate('%i' % x, (i, x + 10))
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
<matplotlib.text.Text at 0x116eb8438>
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, color=plot_color, ylim=(0,2000))
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
def score_summary(domain, test_type=None):
subset = lsl_dr[lsl_dr.domain==domain].copy()
if test_type is not None:
subset = subset[subset.test_type==test_type]
subset['age_test'] = (subset.age_test/12).dropna().astype(int)
subset.loc[subset.age_test > 11, 'age_test'] = 11
subset = subset[subset.age_test>1]
byage = subset.groupby('age_test')
n = byage.study_id.count()
mean = byage.score.mean()
sd = byage.score.std()
min = byage.score.min()
max = byage.score.max()
summary = pd.DataFrame({'Sample Size':n, 'Mean':mean,
'SD':sd, 'Min':min, 'Max':max})
return summary[['Sample Size','Mean','SD','Min','Max']]
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
age_test | |||||
2 | 365 | 89.956164 | 19.590400 | 40 | 143 |
3 | 1088 | 91.164522 | 18.890610 | 40 | 139 |
4 | 770 | 88.429870 | 20.195620 | 20 | 137 |
5 | 532 | 81.971805 | 18.450145 | 20 | 120 |
6 | 270 | 81.548148 | 14.985603 | 40 | 110 |
7 | 194 | 81.747423 | 14.273898 | 49 | 130 |
8 | 153 | 74.856209 | 16.668761 | 40 | 120 |
9 | 98 | 68.153061 | 17.462981 | 34 | 93 |
10 | 80 | 73.562500 | 14.548778 | 41 | 100 |
11 | 63 | 64.698413 | 13.498668 | 43 | 107 |
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
(-0.5, 9.5)
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
age_test | |||||
2 | 381 | 90.889764 | 22.975237 | 50 | 141 |
3 | 1032 | 92.033915 | 22.110072 | 39 | 145 |
4 | 725 | 88.816552 | 23.213417 | 0 | 140 |
5 | 476 | 83.004202 | 19.098415 | 26 | 125 |
6 | 253 | 81.833992 | 17.057678 | 20 | 111 |
7 | 187 | 83.962567 | 15.558205 | 49 | 115 |
8 | 154 | 81.188312 | 14.109177 | 55 | 114 |
9 | 81 | 75.086420 | 18.305872 | 41 | 105 |
10 | 85 | 77.847059 | 14.929601 | 52 | 110 |
11 | 68 | 74.102941 | 10.905482 | 57 | 99 |
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
plt.ylim(0, 1100)
(0, 1100)
articulation_summary = score_summary("Articulation")
articulation_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
age_test | |||||
2 | 310 | 84.058065 | 16.079390 | 52 | 122 |
3 | 870 | 81.024138 | 18.826887 | 40 | 126 |
4 | 630 | 81.941270 | 19.455172 | 40 | 117 |
5 | 498 | 77.323293 | 21.931068 | 39 | 112 |
6 | 268 | 73.100746 | 24.007043 | 39 | 110 |
7 | 173 | 79.236994 | 20.594829 | 40 | 108 |
8 | 128 | 79.210938 | 20.123498 | 40 | 107 |
9 | 70 | 75.271429 | 20.355480 | 40 | 106 |
10 | 43 | 78.046512 | 21.542818 | 40 | 105 |
11 | 97 | 85.494845 | 15.293738 | 39 | 104 |
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
Language scores
lsl_dr.domain.unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', 'Receptive Vocabulary', nan], dtype=object)
lsl_dr.test_type.unique()
array(['EOWPVT', 'receptive', 'expressive', 'Goldman', 'PPVT', nan, 'Arizonia', 'EVT', 'ROWPVT', 'Arizonia and Goldman', 'EOWPVT and EVT', 'PPVT and ROWPVT'], dtype=object)
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
age_test | |||||
2 | 891 | 80.783389 | 22.020807 | 50 | 150 |
3 | 944 | 81.379237 | 20.532473 | 50 | 131 |
4 | 661 | 80.316188 | 20.072630 | 43 | 132 |
5 | 457 | 78.770241 | 18.782742 | 49 | 127 |
6 | 222 | 71.225225 | 17.404782 | 45 | 115 |
7 | 148 | 75.824324 | 16.579931 | 41 | 108 |
8 | 153 | 67.764706 | 17.823611 | 40 | 123 |
9 | 14 | 52.642857 | 17.367552 | 40 | 82 |
10 | 16 | 69.437500 | 9.244593 | 58 | 88 |
11 | 20 | 82.000000 | 27.868110 | 40 | 130 |
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
age_test | |||||
2 | 884 | 85.030543 | 17.776904 | 50 | 150 |
3 | 948 | 79.734177 | 17.122609 | 50 | 135 |
4 | 652 | 76.124233 | 19.010666 | 48 | 126 |
5 | 469 | 72.812367 | 19.812556 | 45 | 127 |
6 | 221 | 63.923077 | 15.833465 | 44 | 103 |
7 | 156 | 66.576923 | 19.150689 | 45 | 114 |
8 | 155 | 59.948387 | 20.547630 | 40 | 112 |
9 | 14 | 48.357143 | 13.964830 | 40 | 79 |
10 | 16 | 68.250000 | 16.699301 | 40 | 107 |
11 | 20 | 81.750000 | 29.889049 | 40 | 128 |
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
(unique_students.age/12.).hist(grid=False, bins=np.sqrt(unique_students.shape[0]))
plt.ylabel('Count')
plt.xlabel('Age at enrollment')
<matplotlib.text.Text at 0x114f86588>
(unique_students.age/12.).describe()
count 4650.000000 mean 2.623047 std 2.304916 min 0.000000 25% 0.833333 50% 2.166667 75% 3.500000 max 18.416667 Name: age, dtype: float64
def calc_difference(x, col='a_fo', jitter=True):
if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
return None
diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
if jitter:
diff += np.random.normal(scale=0.05)
if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
print(x['funct_out_age'])
return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
<matplotlib.text.Text at 0x114170048>
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
<matplotlib.text.Text at 0x114f84518>
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
<matplotlib.text.Text at 0x11418bb00>
lsl_dr.degree_hl.dropna().value_counts()
6 10446 3 3127 4 3041 5 2705 2 1416 0 549 1 170 dtype: int64
ax = lsl_dr.degree_hl.hist(bins=7)
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x113fad438>
(lsl_dr.age_int<6).mean()
0.1993352852734035
(lsl_dr.age<6).mean()
0.15173696288676111