#!/usr/bin/env python # coding: utf-8 # In[1]: # Import modules and set options get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import pandas as pd import numpy as np # Connect to database to import data for the three test domains and demographic information: # In[2]: from redcap import Project api_url = 'https://redcap.vanderbilt.edu/api/' api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read() lsl_dr_project = Project(api_url, api_key) # In[3]: metadata = lsl_dr_project.export_metadata() # In[4]: # for i,j in zip(lsl_dr_project.field_names, # lsl_dr_project.field_labels): # print('{0}: \t{1}'.format(i,j)) # Import each database from REDCap: # In[5]: articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss'] articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None, 'na_values':[999, 9999]}) # In[6]: records = lsl_dr_project.export_records(fields=articulation_fields) # In[7]: print(records[0]['study_id']) # In[8]: expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss'] expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', df_kwargs={'index_col':None, 'na_values':[999, 9999]}) # In[9]: receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss'] receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', df_kwargs={'index_col':None, 'na_values':[999, 9999]}) # In[10]: language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls', 'owls_lc_ss','owls_oe_ss','age_test_owls', 'celfp_rl_ss','celfp_el_ss','age_test_celp', 'celf_elss','celf_rlss','age_test_celf', 'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss', 'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss', 'celfp_wct_ss'] language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', df_kwargs={'index_col':None, 'na_values':[999, 9999]}) # In[11]: demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year', 'academic_year_rv', 'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race', 'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as', 'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae', 'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes', 'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age', 'att_days_hr', 'att_days_sch', 'att_days_st2_417'] demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', df_kwargs={'index_col':None, 'na_values':[888, 999, 9999]}) # In[12]: demographic_raw.academic_year_rv.value_counts() # In[13]: demographic_raw[demographic_raw.study_id=='1147-2010-0064'] # ## Attendance information # Several fields in the demographic data have missing values. # In[14]: demographic_raw.head() # We can fill missing values forward from previous observation (by `study_id`) # In[15]: demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform( lambda recs: recs.fillna(method='ffill'))#.reset_index() demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id # Random check to make sure this worked # In[16]: demographic[demographic.study_id=='1147-2010-0064'] # Demographic data without missing values: # In[17]: demographic.head() # ## Cleaning languge dataset # # 5 language measures: # # - 3 versions of CELF # - PLS # - pls_ac_rs: PLS: Auditory Comprehension Raw Score # - pls_ac_ss: PLS: Auditory Comprehension Standard Score # - pls_ec_rs: PLS: Expressive Communication Raw Score # - pls_ec_ss: PLS: Expressive Communication Standard Score # - pls_tl_rs: PLS: Total Language Score Standard Score Total # - pls_tl_ss: PLS: Total Language Score Standard Score # - OWLS # - age_test_owls: Age at time of testing (OWLS) # - owls_lc_rs: OWLS: Listening Comprehension Raw Score # - owls_lc_ss: OWLS: Listening Comprehension Standard Score # - owls_oe_rs: OWLS: Oral Expression Raw Score # - owls_oe_ss: OWLS: Oral Expression Standard Score # - owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores # - owls_oc_ss: OWLS: Oral Composite Standard Score # - owls_wes_trs: OWLS: Written Expression Scale Total Raw Score # - owls_wes_as: OWLS: Written Expression Scale Ability Score # - owls_wes_ss: OWLS: Written Expression Scale Standard Score # - owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score) # - owls_lcss: OWLS: Language Composite Standard Score # In[18]: # Test type language_raw["test_name"] = None language_raw["test_type"] = None language_raw["score"] = None CELP = language_raw.age_test_celp.notnull() CELF = language_raw.age_test_celf.notnull() PLS = language_raw.age_test_pls.notnull() OWLS = language_raw.age_test_owls.notnull() language_raw['age_test'] = None language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls language1 = language_raw[CELP | CELF | PLS | OWLS].copy() language2 = language1.copy() language1["test_type"] = "receptive" language1.loc[CELP, "test_name"] = "CELF-P2" language1.loc[CELF, "test_name"] = "CELF-4" language1.loc[PLS, "test_name"] = "PLS" language1.loc[OWLS, "test_name"] = "OWLS" language1.loc[CELP, "score"] = language1.celfp_rl_ss language1.loc[CELF, "score"] = language1.celf_rlss language1.loc[PLS, "score"] = language1.pls_ac_ss language1.loc[OWLS, "score"] = language1.owls_lc_ss language2["test_type"] = "expressive" language2.loc[CELP, "test_name"] = "CELF-P2" language2.loc[CELF, "test_name"] = "CELF-4" language2.loc[PLS, "test_name"] = "PLS" language2.loc[OWLS, "test_name"] = "OWLS" language2.loc[CELP, "score"] = language1.celfp_el_ss language2.loc[CELF, "score"] = language1.celf_elss language2.loc[PLS, "score"] = language1.pls_ec_ss language2.loc[OWLS, "score"] = language1.owls_oe_ss language = pd.concat([language1, language2]) language = language[language.score.notnull()] print(pd.crosstab(language.test_name, language.test_type)) print("There are {0} null values for score".format(sum(language["score"].isnull()))) # A `school` variable was added, which is the first four columns of the `study_id`: # In[19]: language["school"] = language.study_id.str.slice(0,4) # In[20]: language_subtest = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test", 'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss', 'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss', 'celfp_wct_ss']] # In[21]: language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]] language["domain"] = "Language" language.head() # ## Cleaning articulation dataset # # We converted the articulation dataset into a "long" format: # In[22]: # Test type articulation["test_type"] = None ARIZ = articulation.aaps_ss.notnull() GF = articulation.gf2_ss.notnull() articulation = articulation[ARIZ | GF] articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman" articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia" articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman" print(articulation.test_type.value_counts()) print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull()))) # Test score (Arizonia if both) articulation["score"] = articulation.aaps_ss articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF] # A `school` variable was added, which is the first four columns of the `study_id`: # In[23]: articulation["school"] = articulation.study_id.str.slice(0,4) # The age was taken to be the Arizonia age if there are both test types: # In[24]: articulation["age_test"] = articulation.age_test_aaps articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()] print(articulation.age_test.describe()) # Finally, we dropped unwanted columns and added a domain identification column for merging: # In[25]: articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1) articulation["domain"] = "Articulation" articulation.head() # ## Cleaning demographic dataset # # We excluded unwanted columns and rows for which age, gender or race were missing: # In[26]: # Retain only subset of columns #demographic = demographic[demographic.gender.notnull()] demographic = demographic.rename(columns={'gender':'male'}) # Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1): # In[27]: demographic["non_english"] = None demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0 print(demographic.non_english.value_counts()) print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull()))) # Mother's education (`mother_ed`) and father's education (`father_ed`) were both recoded to: # # * 0=no high school diploma # * 1=high school # * 2=undergraduate # * 3=graduate # # Category 6 (unknown) was recoded as missing. # In[28]: demographic = demographic.rename(columns={"mother_ed":"_mother_ed"}) demographic["mother_ed"] = demographic._mother_ed.copy() demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0 demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1 demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2 demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3 demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None print("_mother_ed:") print(demographic._mother_ed.value_counts()) print("mother_ed:") print(demographic.mother_ed.value_counts()) print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull()))) # Secondary diagnosis # In[29]: demographic.shape # In[30]: demographic['secondary_diagnosis'] = demographic.etiology==0 # Suspected or unknown treated as missing demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None # In[31]: demographic.secondary_diagnosis.value_counts() # In[32]: demographic.secondary_diagnosis.mean() # Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks. # In[33]: demographic['premature_weeks'] = demographic.premature_age.copy() demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None demographic.premature_weeks = abs(demographic.premature_weeks-8)*2 print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull()))) # In[34]: demographic.premature_weeks.value_counts() # Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant): # In[35]: demographic.tech_ad.value_counts() # In[36]: tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"] demographic["tech_right"] = 4 demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0 demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1 demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2 demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3 demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None demographic["tech_left"] = 4 demographic.loc[demographic.tech_as==7, 'tech_left'] = 0 demographic.loc[demographic.tech_as==3, 'tech_left'] = 1 demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2 demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3 demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None # In[37]: demographic.tech_left.value_counts() # In[38]: demographic.tech_right.value_counts() # Substitute valid missing values for hearing loss: # In[39]: demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None # Create `degree_hl`, which is the maximum level of hearing loss in either ear: # In[40]: demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as) # Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant): # # * 0=none # * 1=one ear # * 2=both ears. # In[41]: demographic.columns # In[42]: demographic["oad"] = 0 demographic.oad = demographic.oad.astype(object) demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1 demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2 demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None print("oad:") print(demographic.drop_duplicates(subset='study_id').oad.value_counts()) print("There are {0} null values for OAD".format(sum(demographic.oad.isnull()))) demographic["hearing_aid"] = 0 demographic.hearing_aid = demographic.hearing_aid.astype(object) demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1 demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2 demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None print("\nhearing_aid:") print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts()) print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull()))) demographic["cochlear"] = 0 demographic.cochlear = demographic.cochlear.astype(object) demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1 demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2 demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None print("\ncochlear:") print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts()) print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull()))) print(len(demographic)) # Identify bilateral and bimodal individuals: # In[43]: demographic["unilateral_ci"] = demographic.cochlear==1 demographic["bilateral_ci"] = demographic.cochlear==2 demographic["bilateral_ha"] = demographic.hearing_aid==2 demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1) # In[44]: demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum() # In[45]: demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 'bilateral_ha', 'bimodal']].sum() # Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2) # In[46]: demographic['tech'] = 0 demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1 demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2 print("There are {0} null values for tech".format(sum(demographic.tech.isnull()))) # In[47]: demographic["implant_category"] = None demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 'implant_category'] = 0 demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 'implant_category'] = 1 demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 'implant_category'] = 2 demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 'implant_category'] = 3 demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 'implant_category'] = 4 demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 'implant_category'] = 5 demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 'implant_category'] = 6 demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 'implant_category'] = 7 demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 'implant_category'] = 8 demographic.implant_category.value_counts() # **Age when hearing loss diagnosed** Data are entered inconsistently here, so we have to go in and replace non-numeric values. # In[48]: demographic.onset_1.unique() # In[49]: # Don't need this anymore # demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, # 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float) demographic['age_diag'] = demographic.onset_1 # Number of null values for `age_diag` # In[50]: demographic.age_diag.isnull().sum() # In[51]: demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'}) # In[52]: import seaborn as sb unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first() # ag = sb.factorplot("sex", data=unique_students, # palette="PuBuGn_d", kind='count') # ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), # 'Male ({})'.format((unique_students.male==1).sum())]) # ag.set_xlabels('') # In[162]: unique_students.shape # Child has another diagnosed disability # In[53]: demographic['known_synd'] = (demographic.synd_cause == 0) # Unknown or suspected demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None # In[54]: # If either known syndrome or secondary diagnosis demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1) # Missing sibling counts were properly encoded as `None` (missing). # In[55]: demographic.loc[demographic.sib==4, 'sib'] = None # We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing. # In[56]: races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"] demographic = demographic.rename(columns={"race":"_race"}) demographic["race"] = demographic._race.copy() demographic.loc[demographic.race==7, 'race'] = None demographic.loc[demographic.race>3, 'race'] = 4 print("_race:") print(demographic._race.value_counts()) print("race:") print(demographic.race.value_counts()) print("There are {0} null values for race".format(sum(demographic.race.isnull()))) # Replace with recoded column # Recode implant technology variables # In[57]: tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"] demographic["tech_right"] = demographic.tech_ad.copy() demographic.loc[demographic.tech_right==6, 'tech_right'] = 0 demographic.loc[demographic.tech_right==4, 'tech_right'] = 1 demographic.loc[demographic.tech_right==5, 'tech_right'] = 1 demographic.loc[demographic.tech_right==3, 'tech_right'] = 2 demographic.loc[demographic.tech_right==7, 'tech_right'] = 3 demographic.loc[demographic.tech_right==8, 'tech_right'] = 3 demographic.loc[demographic.tech_right==9, 'tech_right'] = 4 demographic.tech_right = np.abs(demographic.tech_right - 3) demographic["tech_left"] = demographic.tech_as.copy() demographic.loc[demographic.tech_left==6, 'tech_left'] = 0 demographic.loc[demographic.tech_left==4, 'tech_left'] = 1 demographic.loc[demographic.tech_left==5, 'tech_left'] = 1 demographic.loc[demographic.tech_left==3, 'tech_left'] = 2 demographic.loc[demographic.tech_left==7, 'tech_left'] = 3 demographic.loc[demographic.tech_left==8, 'tech_left'] = 3 demographic.loc[demographic.tech_left==9, 'tech_left'] = 4 demographic.tech_left = np.abs(demographic.tech_left - 3) # In[58]: # Don't need this anymore # demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, # 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan, # 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5, # '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float) # In[59]: demographic.academic_year.replace( {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', '2020-2011': '2010-2011', '2012-20013': '2012-2013', '642014-2015': '2014-2015', '20114-2015': '2014-2015', '2011-012': '2011-2012', '0000-0000': np.nan}).str.replace('*', '-').unique() # In[60]: demographic['academic_year'] = demographic.academic_year.replace( {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', '2020-2011': '2010-2011', '2012-20013': '2012-2013', '642014-2015': '2014-2015', '20114-2015': '2014-2015', '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005', '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015', '2015-2015': '2014-2015', '2009-2011': '2009-2010', '0000-0000': np.nan}).str.replace('*', '-') # Removed entries that don't contain dashes # In[61]: demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')), 'academic_year'] = np.nan # In[62]: demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split())) # In[63]: demographic.age_amp.hist() # ## Cleaning expressive vocabulary dataset # # We converted the expressive vocabulary dataset to "long" format: # In[64]: # Test type expressive["test_type"] = None EOWPVT = expressive.eowpvt_ss.notnull() EVT = expressive.evt_ss.notnull() expressive = expressive[EOWPVT | EVT] expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT" expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT" expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT" print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull()))) expressive["score"] = expressive.eowpvt_ss expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT] # In[65]: expressive.test_type.value_counts() # A `school` variable was added, which is the first four columns of the `study_id`: # In[66]: expressive["school"] = expressive.study_id.str.slice(0,4) # The age was taken to be the EOWPVT age if there are both test types: # In[67]: expressive["age_test"] = expressive.age_test_eowpvt expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()] # Finally, we dropped unwanted columns and added a domain identification column for merging: # In[68]: expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]] expressive["domain"] = "Expressive Vocabulary" expressive.head() # ## Cleaning receptive vocabulary dataset # We converted the receptive vocabulary data table to "long" format: # In[69]: receptive.columns # In[70]: # Test type receptive["test_type"] = None PPVT = receptive.ppvt_ss.notnull() ROWPVT = receptive.rowpvt_ss.notnull() receptive = receptive[PPVT | ROWPVT] receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT" receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT" receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT" print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull()))) receptive["score"] = receptive.ppvt_ss receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT] # A `school` variable was added, which is the first four columns of the `study_id`: # In[71]: receptive["school"] = receptive.study_id.str.slice(0,4) # The age was taken to be the PPVT age if there are both test types: # In[72]: receptive["age_test"] = receptive.age_test_ppvt receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()] # In[73]: print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull()))) # Finally, we dropped unwanted columns and added a domain identification column for merging: # In[74]: receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]] receptive["domain"] = "Receptive Vocabulary" receptive.head() # In[75]: receptive.study_id.unique().shape # ## Merge datasets # # The four datasets were mereged into a single table. First, we concatenate the test scores data: # In[76]: test_scores = pd.concat([articulation, expressive, receptive, language]) # Then we perform a merge between the demographic data and the test scores data: # In[77]: lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left') # In[78]: lsl_dr.tail() # In[79]: lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4]) lsl_dr.academic_year_start.value_counts() # In[80]: current_year_only = False if current_year_only: lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013'] # In[81]: expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score expressive_scores.hist(bins=25) plt.xlabel('Standard scores'); plt.ylabel('Frequency'); # In[82]: expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') & (lsl_dr.test_type=='expressive')].score expressive_lang_scores.hist(bins=25) plt.xlabel('Standard scores'); plt.ylabel('Frequency'); # Export dataset # In[83]: if current_year_only: lsl_dr.to_csv('lsl_dr_current_year.csv') else: lsl_dr.to_csv('lsl_dr.csv') # In[163]: lsl_dr.shape # In[164]: lsl_dr.study_id.unique().shape # In[86]: demographic.study_id.unique().shape # Convert score to floating-point number # In[87]: lsl_dr.score = lsl_dr.score.astype(float) # In[88]: lsl_dr['tech_class'] = 'Bimodal' lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI' lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA' # In[89]: lsl_dr['age_year'] = np.floor(lsl_dr.age/12.) # In[90]: lsl_dr.domain.dropna().unique() # In[91]: lsl_dr.groupby('tech_class').prim_lang.mean().round(2) # In[92]: lsl_dr['non_profound'] = lsl_dr.degree_hl<6 # In[93]: lsl_dr.groupby('tech_class').non_profound.mean().round(2) # In[138]: lsl_dr['age_test_year'] = -999 lsl_dr.loc[lsl_dr.age_test.notnull(), 'age_test_year'] = (lsl_dr.age_test/12).dropna().astype(int) lsl_dr.loc[lsl_dr.age_test_year==-999, 'age_test_year'] = np.nan # In[139]: f, axes = plt.subplots(2, 2, figsize=(14,10)) for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()): plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean') plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax) ax.set_ylim(40, 120) ax.set_xticks(range(2,7)) ax.set_title(dom) # ## PPVT # In[141]: ppvt_only = lsl_dr[lsl_dr.test_type=='PPVT'] ppvt_only.age_year.hist() # In[142]: ppvt_345 = ppvt_only[ppvt_only.age_test_year.isin([3,4,5])] # In[143]: ppvt_345.score.describe() # In[145]: ppvt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]}) # ## EVT # In[146]: lsl_dr.test_type.value_counts() # In[147]: evt_only = lsl_dr[lsl_dr.test_type=='EVT'] evt_only.age_test_year.hist() # In[148]: evt_345 = evt_only[evt_only.age_test_year.isin([3,4,5])] # In[149]: evt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]}) # ## PLS # In[151]: pls_only = (language[(language.test_name=='PLS')] .convert_objects(convert_numeric=True)) pls_only['age_year'] = np.floor(pls_only.age_test/12).astype(int) pls_345 = pls_only[pls_only.age_year.isin([3,4,5])] # In[152]: (pls_345.assign(normal_limits=pls_345.score>=85).groupby(['age_year', 'test_type']) .agg({'score':[min, max, np.median, len], 'normal_limits': np.mean})) # ## CELF # In[153]: celf_only = (language_subtest[(language_subtest.test_name=='CELF-P2')] .convert_objects(convert_numeric=True)) celf_only['age_year'] = np.floor(celf_only.age_test/12).astype(int) celf_46 = celf_only[celf_only.age_year.isin([4,6])] # In[154]: subtests = ['celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss', 'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss', 'celfp_wct_ss'] # In[155]: (celf_46.groupby('age_year') .agg({st:np.median for st in subtests})).T # ### Proportions in normal range # In[108]: def calc_norm_range(dataset): return (dataset.groupby('study_id').score.mean() >= 85).mean() # Mean score of each domain # In[156]: calc_norm_range(lsl_dr[(lsl_dr.domain=='Language') & (lsl_dr.test_type=='expressive') & (lsl_dr.age_test_year.isin([3,4,5]))]) # In[157]: for year in range(2010, 2014): value = calc_norm_range(lsl_dr[(lsl_dr.domain=='Language') & (lsl_dr.test_type=='receptive') & (lsl_dr.academic_year_rv==year) & (lsl_dr.age_test_year.isin([3,4,5]))]).round(2) print('{}: {}'.format(year, value)) # In[158]: calc_norm_range(lsl_dr[(lsl_dr.domain=='Receptive Vocabulary') & (lsl_dr.age_test_year.isin([3,4,5]))]) # In[159]: calc_norm_range(lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.age_test_year.isin([3,4,5]))]) # In[160]: calc_norm_range(lsl_dr[(lsl_dr.domain=='Articulation') & (lsl_dr.age_test_year.isin([3,4,5]))]) # Summary statistics # In[189]: (lsl_dr.groupby('study_id').male.first().dropna()==0).mean() # In[197]: (lsl_dr.groupby('study_id').race.first().dropna()==0).mean() # In[204]: (lsl_dr.groupby('study_id').non_english.first().dropna()==False).sum() # In[208]: lsl_dr.groupby('study_id').sib.first().dropna().count() # In[213]: lsl_dr.groupby('study_id').onset_1.first().dropna().count() # In[215]: lsl_dr.groupby('study_id').age_amp.first().dropna().median() # In[218]: lsl_dr.groupby('study_id').age_int.first().dropna().median() # In[220]: lsl_dr.groupby('study_id').age.first().dropna().count() # In[247]: _unique = lsl_dr.dropna(subset=['age_disenrolled', 'age']).groupby('study_id').first() (_unique.age_disenrolled - _unique.age).count() # In[254]: synd_cause = lsl_dr.groupby('study_id').synd_cause.first().dropna() synd_cause = synd_cause[synd_cause<3] # In[257]: synd_cause.value_counts()/synd_cause.value_counts().sum() # In[262]: etiology = lsl_dr.groupby('study_id').etiology.first().dropna() etiology = etiology[etiology<3] # In[264]: etiology.value_counts()/etiology.value_counts().sum() # In[267]: lsl_dr['concerns'] = lsl_dr.etiology_2.replace({0:'none', 4:'none', 1:'mild', 2:'moderate', 3:'severe'}) # In[270]: lsl_dr.groupby('study_id').concerns.last().dropna().value_counts() # ### Plots of Demographic Data # In[271]: plot_color = "#64AAE8" # In[272]: def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, ylim=None, title=None, **kwargs): ax = kwargs.get('ax') if ax is None: f, ax = plt.subplots() counts = series.value_counts().sort_index() counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs) if xlim is None: ax.set_xlim(-0.5, len(counts)-0.5) if ylim is not None: ax.set_ylim(*ylim) ax.set_ylabel('Count') if labels is not None: ax.set_xticklabels(labels) if title: ax.set_title(title) for i,x in enumerate(counts): ax.annotate('%i' % x, (i, x + label_offset)) # plt.gca().tight_layout() # In[273]: unique_students = demographic.drop_duplicates('study_id') # In[274]: unique_students.shape # In[275]: unique_students.age.describe() # In[276]: plot_demo_data(unique_students.male, ('Female', 'Male'), label_offset=20, color=plot_color) # In[277]: plot_demo_data(unique_students.prim_lang, ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), rot=70, color=plot_color) # In[278]: unique_students.prim_lang.count() # In[279]: plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), color=plot_color) # In[280]: unique_students.sib.count() # In[281]: amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", "13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", "3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"] demographic.loc[demographic.age_amp==11, 'age_amp'] = None print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull()))) # In[282]: age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000], labels=amp_ages)) # In[283]: age_amp_counts = age_classes.value_counts()[amp_ages] age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color) plt.xlim(-0.5, len(age_amp_counts)-0.5) plt.ylabel('Count') plt.ylim(0,1000) for i,x in enumerate(age_amp_counts): plt.annotate('%i' % x, (i, x + 10)) # In[284]: age_amp_counts.sum() # In[285]: unique_students.age_amp.max() # In[286]: (unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color) plt.ylabel('Count') plt.xlabel('Age at amplification') # In[287]: plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color, ylim=(0, 3000)) # In[288]: plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500)) # In[289]: f, axes = plt.subplots(2, 1) plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, ax=axes[0], title='Right ear', color=plot_color) plot_demo_data(unique_students.tech_left, tech_cats, rot=90, ax=axes[1], title='Left ear', color=plot_color) # In[290]: unique_students.tech_right.count() # In[291]: unique_students.tech_left.count() # In[292]: degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \ 'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)' plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color) # In[293]: _, axes = plt.subplots(2, 1) plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, color=plot_color, ax=axes[0], title='Right ear') plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear'); # In[294]: unique_students.degree_hl_as.count() # In[295]: type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown' plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color) # In[296]: plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color) # In[297]: unique_students.type_hl_ad.count() # In[298]: unique_students.type_hl_as.count() # In[299]: type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown' f, axes = plt.subplots(2,1) plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, title='Right ear', ax=axes[0], color=plot_color) plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, title='Left ear', ax=axes[1], color=plot_color) # In[300]: demographic[demographic.study_id=='1147-2010-0064'] # In[301]: receptive[receptive.study_id=='1147-2010-0064'] # In[302]: lsl_dr[lsl_dr.study_id=='1147-2010-0064'] # In[303]: unique_students.type_hl_ad.count() # In[304]: receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape # In[305]: demographic.study_id.unique().shape # In[306]: receptive.study_id.unique().shape # In[307]: lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape # In[308]: receptive_ids = receptive.study_id.unique() # In[309]: demographic_ids = demographic.study_id.unique() # In[310]: [s for s in receptive_ids if s not in demographic_ids] # In[311]: def score_summary(domain, test_type=None): subset = lsl_dr[lsl_dr.domain==domain].copy() if test_type is not None: subset = subset[subset.test_type==test_type] subset['age_test'] = (subset.age_test/12).dropna().astype(int) subset.loc[subset.age_test > 11, 'age_test'] = 11 subset = subset[subset.age_test>1] byage = subset.groupby('age_test') n = byage.study_id.count() mean = byage.score.mean() sd = byage.score.std() min = byage.score.min() max = byage.score.max() summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 'SD':sd, 'Min':min, 'Max':max}) summary.index = summary.index.values.astype(int) return summary[['Sample Size','Mean','SD','Min','Max']] # In[312]: receptive_summary = score_summary("Receptive Vocabulary") receptive_summary # In[313]: receptive_summary.describe() # In[314]: receptive_summary['Sample Size'].sum() # In[315]: receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color) for i,x in enumerate(receptive_summary["Sample Size"]): plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center") plt.ylabel('Count') plt.xlabel('Age') # In[316]: expressive_summary = score_summary("Expressive Vocabulary") expressive_summary # In[317]: expressive_summary['Sample Size'].sum() # In[318]: expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color) for i,x in enumerate(expressive_summary["Sample Size"]): plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center") plt.ylabel('Count') plt.xlabel('Age') plt.xlim(-0.5, 9.5) if current_year_only: plt.ylim(0, 800) else: plt.ylim(0, 1800) # In[319]: articulation_summary = score_summary("Articulation") articulation_summary # In[320]: articulation_summary['Sample Size'].sum() # In[321]: sample_size = articulation_summary["Sample Size"] sample_size.plot(kind='bar', grid=False, color=plot_color) for i,x in enumerate(articulation_summary["Sample Size"]): plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center") plt.ylabel('Count') plt.xlabel('Age') plt.ylim(0, sample_size.max()+50); # Language scores # In[322]: lsl_dr.domain.unique() # In[323]: lsl_dr.test_type.unique() # In[324]: receptive_language_summary = score_summary("Language", "receptive") receptive_language_summary # In[325]: receptive_language_summary['Sample Size'].sum() # In[326]: sample_size = receptive_language_summary["Sample Size"] sample_size.plot(kind='bar', grid=False, color=plot_color) for i,x in enumerate(receptive_language_summary["Sample Size"]): plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center") plt.ylabel('Count') plt.xlabel('Age') plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5); # In[327]: expressive_language_summary = score_summary("Language", "expressive") expressive_language_summary # In[328]: expressive_language_summary['Sample Size'].sum() # In[329]: sample_size = expressive_language_summary["Sample Size"] sample_size.plot(kind='bar', grid=False, color=plot_color) for i,x in enumerate(expressive_language_summary["Sample Size"]): plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center") plt.ylabel('Count') plt.xlabel('Age') plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5); # In[330]: (unique_students.age/12.).describe() # In[331]: def calc_difference(x, col='a_fo', jitter=True): if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()): return None diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()] if jitter: diff += np.random.normal(scale=0.05) if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000: print(x['funct_out_age']) return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()}) # In[332]: audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist()) # In[333]: plt.figure(figsize=(10,6)) plt.scatter(audition.months, audition.difference, alpha=0.5) plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)') plt.title('Audition') # In[334]: slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist()) # In[335]: plt.figure(figsize=(10,6)) plt.scatter(slc.months, slc.difference, alpha=0.5) plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)') plt.title('Spoken language (comprehension)') # In[336]: sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist()) # In[337]: plt.figure(figsize=(10,6)) plt.scatter(sle.months, sle.difference, alpha=0.5) plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)') plt.title('Spoken language (expression)') # In[338]: lsl_dr.degree_hl.dropna().value_counts() # In[339]: ax = lsl_dr.degree_hl.hist(bins=7) # In[340]: diff = (lsl_dr['age'] - lsl_dr['age_int']) diff[diff>0].hist(bins=50) # In[341]: (lsl_dr.age_int<6).mean() # In[342]: (lsl_dr.age<6).mean() # Counts by year # In[343]: unique_students = lsl_dr.groupby('study_id').first() unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar') plt.ylabel('Frequency'); plt.xlabel('Academic year'); # In[344]: disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0) disab_by_year.columns = ['No', 'Yes'] disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True) # The following counts of ages allows for multiple tests per year # In[345]: test_age = (lsl_dr.assign(age_test_year=(lsl_dr.age_test/12)) .dropna(subset=['age_test'])[['study_id','age_test','age_test_year']]) test_age.assign(age_year=test_age.age_test_year.astype(int)).age_year.value_counts().sort_index() # This summary counts children only once per year: # In[346]: from itertools import chain unique_age_vals = (test_age.assign(age_year=test_age.age_test_year.astype(int)) .groupby('study_id') .age_year.unique().tolist()) pd.Series(np.concatenate(unique_age_vals)).value_counts().sort_index() # In[347]: test_age.assign(age_year=test_age.age_test_year.astype(int)).groupby('study_id').age_year.last().value_counts().sort_index() # In[ ]: