# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Connect to database to import data for the three test domains and demographic information:
from redcap import Project

# REDCap API endpoint and per-user token (kept in a local file so the
# secret is not embedded in the notebook).
api_url = 'https://redcap.vanderbilt.edu/api/'
# Use a context manager so the token file handle is closed promptly
# (the original open(...).read() leaked the file object).
with open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt") as token_file:
    api_key = token_file.read()
lsl_dr_project = Project(api_url, api_key)
metadata = lsl_dr_project.export_metadata()
# for i,j in zip(lsl_dr_project.field_names,
# lsl_dr_project.field_labels):
# print('{0}: \t{1}'.format(i,j))
Import each database from REDCap:
# Articulation fields: Arizona (aaps_ss) and Goldman-Fristoe (gf2_ss)
# standard scores plus age at each test.
# NOTE(review): unlike the other exports below, no na_values are supplied
# here -- confirm 999/9999 sentinels cannot occur in these fields.
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None})
# Raw (list-of-dicts) export, used to spot-check the first record below
records = lsl_dr_project.export_records(fields=articulation_fields)
print(records[0]['study_id'])
0101-2002-0101
# Expressive vocabulary fields (EOWPVT and EVT standard scores);
# 999/9999 are sentinel codes for missing data.
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Receptive vocabulary fields (PPVT and ROWPVT standard scores);
# 999/9999 are sentinel codes for missing data.
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Language fields: PLS, OWLS, CELF-P2 (celfp_*) and CELF-4 (celf_*)
# composite and subtest standard scores; 999/9999 code missing data.
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf',
'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
# Demographic and audiological fields; 888 is an additional missing-data
# sentinel used only in this table.
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[888, 999, 9999]})
# Spot-check one subject's longitudinal records
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14329 | 1147-2010-0064 | initial_assessment_arm_1 | 2010-2011 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | ... | 3.0 | 6.0 | 65.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14330 | 1147-2010-0064 | year_1_complete_71_arm_1 | 2011-2012 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 5.0 | 77.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14331 | 1147-2010-0064 | year_2_complete_71_arm_1 | 2012-2013 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 3.0 | 5.0 | 89.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
14332 | 1147-2010-0064 | year_3_complete_71_arm_1 | 2013-2014 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 5.0 | 101.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 rows × 46 columns
Several fields in the demographic data have missing values.
# Preview raw demographic data (demographics are missing on follow-up rows)
demographic_raw.head()
study_id | redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | ... | sle_fo | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 2002-2003 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | 6.0 | ... | 2.0 | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0101-2002-0101 | year_1_complete_71_arm_1 | 2003-2004 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 2004-2005 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | 4.0 | 80.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 0101-2002-0101 | year_3_complete_71_arm_1 | 2005-2006 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 96.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 0101-2002-0101 | year_4_complete_71_arm_1 | 2006-2007 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | 5.0 | 5.0 | 109.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 46 columns
We can fill missing values forward from the previous observation (grouping by `study_id`):
# Sort by event name so each subject's earlier visits precede later ones,
# then forward-fill missing demographics from previous observations.
# (DataFrame.sort(columns=...) and fillna(method='ffill') are deprecated;
# use sort_values/ffill. Sorting once also avoids duplicated work.)
demo_sorted = demographic_raw.sort_values(by='redcap_event_name')
demographic = demo_sorted.groupby('study_id').transform(
    lambda recs: recs.ffill())#.reset_index()
# transform drops the grouping key, so restore study_id explicitly
demographic["study_id"] = demo_sorted.study_id
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__': /Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) app.launch_new_instance()
Random check to make sure this worked
# Verify the forward-fill propagated initial-assessment values to follow-ups
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14329 | initial_assessment_arm_1 | 2010-2011 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 6.0 | 65.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14330 | year_1_complete_71_arm_1 | 2011-2012 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 5.0 | 77.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14331 | year_2_complete_71_arm_1 | 2012-2013 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 5.0 | 89.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
14332 | year_3_complete_71_arm_1 | 2013-2014 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 5.0 | 101.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1147-2010-0064 |
4 rows × 46 columns
Demographic data without missing values:
# Preview the forward-filled demographic table
demographic.head()
redcap_event_name | academic_year | hl | gender | race | prim_lang | sib | mother_ed | father_ed | premature_age | ... | a_fo | fam_age | family_inv | att_days_sch | att_days_st2_417 | att_days_hr | demo_ses | school_lunch | medicaid | study_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | initial_assessment_arm_1 | 2002-2003 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 | 6.0 | 9.0 | ... | 2.0 | 54.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0101-2002-0101 |
7884 | initial_assessment_arm_1 | 2008-2009 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5.0 | 5.0 | 7.0 | ... | 5.0 | 53.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-2156 |
7882 | initial_assessment_arm_1 | 2009-2010 | 0.0 | 1.0 | 2.0 | 0.0 | 3.0 | 6.0 | 6.0 | 8.0 | ... | 3.0 | 48.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-2081 |
7876 | initial_assessment_arm_1 | 2009-2010 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 3.0 | 6.0 | 5.0 | ... | 4.0 | 86.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1986 |
7872 | initial_assessment_arm_1 | 2009-2010 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 6.0 | 6.0 | 8.0 | ... | 5.0 | 94.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 0628-2005-1978 |
5 rows × 46 columns
5 language measures:
# Classify each language-assessment row by the test administered (CELF-P2,
# CELF-4, PLS, or OWLS) and reshape to "long" format with one receptive and
# one expressive score row per administered test.
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
# A non-null test age indicates the corresponding test was given
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()
# Age at testing comes from whichever test was administered
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
# Two copies of the rows with any language test: one carries the receptive
# score, the other the expressive score
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()
language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss
language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
# FIX: expressive scores were previously read from language1; the values are
# identical (language2 is a copy), but reading from language2 itself removes
# the latent aliasing mistake.
language2.loc[CELP, "score"] = language2.celfp_el_ss
language2.loc[CELF, "score"] = language2.celf_elss
language2.loc[PLS, "score"] = language2.pls_ec_ss
language2.loc[OWLS, "score"] = language2.owls_oe_ss
# Stack receptive and expressive rows; drop rows with no score
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type expressive receptive test_name CELF-4 611 537 CELF-P2 1448 1453 OWLS 1075 1081 PLS 3447 3459 There are 0 null values for score
A `school` variable was added, which is the first four characters of the `study_id`:
# First four characters of study_id identify the school/site
language["school"] = language.study_id.str.slice(0,4)
# Keep the CELF-P2 subtest scores in a separate table for later use
language_subtest = language[["study_id", "redcap_event_name", "score", "test_type",
"test_name", "school", "age_test",
'celfp_ss_ss', 'celfp_ws_ss',
'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss',
'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']]
# Reduce to the core columns and tag the domain for the later merge
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
We converted the articulation dataset into a "long" format:
# Classify each articulation row by which test(s) were given and pick a
# single standard score (Arizonia preferred when both are present).
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
# FIX: take an explicit copy of the subset so the .loc assignments below
# write to this frame rather than a view of the original (avoids
# SettingWithCopyWarning / potentially silent no-op writes).
articulation = articulation[ARIZ | GF].copy()
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 5286 Arizonia 502 Arizonia and Goldman 73 Name: test_type, dtype: int64 There are 0 null values for test_type
A `school` variable was added, which is the first four characters of the `study_id`:
# First four characters of study_id identify the school/site
articulation["school"] = articulation.study_id.str.slice(0,4)
The age was taken to be the Arizonia age if there are both test types:
# Age at test: prefer the Arizonia age, falling back to the Goldman age
# when the Arizonia test was not given.
articulation["age_test"] = articulation.age_test_aaps.fillna(articulation.age_test_gf2)
print(articulation.age_test.describe())
count 5859.000000 mean 68.853559 std 30.782839 min 23.000000 25% 47.000000 50% 60.000000 75% 81.000000 max 243.000000 Name: age_test, dtype: float64
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Drop the per-test source columns and tag the domain for the later merge
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
study_id | redcap_event_name | test_type | score | school | age_test | domain | |
---|---|---|---|---|---|---|---|
1 | 0101-2002-0101 | year_1_complete_71_arm_1 | Goldman | 78.0 | 0101 | 80.0 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72.0 | 0101 | 44.0 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97.0 | 0101 | 54.0 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75.0 | 0101 | 53.0 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80.0 | 0101 | 66.0 | Articulation |
We excluded unwanted columns and rows for which age, gender or race were missing:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
# Rename gender to male (0/1 indicator; 1 presumably codes male -- the
# later {0:'Female', 1:'Male'} mapping is consistent with this)
demographic = demographic.rename(columns={'gender':'male'})
Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):
# non_english: True where prim_lang > 0 (0 presumably codes English --
# confirm against the codebook); missing prim_lang stays missing
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False 11660 True 2590 Name: non_english, dtype: int64 There are 710 null values for non_english
Mother's education (`mother_ed`) and father's education (`father_ed`) were both recoded to:
Category 6 (unknown) was recoded as missing.
# Recode maternal education: original codes 0-1 -> 0, 2-3 -> 1, 4 -> 2,
# 5 -> 3, and 6 (unknown) -> missing. The raw column is kept as _mother_ed.
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
# FIX: test the original column (_mother_ed) for both 2 and 3. The old code
# tested the partially-recoded mother_ed for 3, which only worked because
# code 3 had not been remapped yet -- a latent ordering bug.
demographic.loc[(demographic._mother_ed==2) | (demographic._mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed: 6.0 5198 4.0 3039 3.0 2053 5.0 1638 2.0 1436 1.0 491 0.0 215 Name: _mother_ed, dtype: int64 mother_ed: 1.0 3489 2.0 3039 3.0 1638 0.0 706 Name: mother_ed, dtype: int64 There are 6088 null values for mother_ed
Secondary diagnosis
# Current dimensions of the demographic table
demographic.shape
(14960, 48)
# secondary_diagnosis: True where etiology==0 -- presumably the code
# indicating a secondary diagnosis; confirm against the codebook
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
demographic.secondary_diagnosis.value_counts()
0.0 10979 1.0 2485 Name: secondary_diagnosis, dtype: int64
# Proportion of rows with a secondary diagnosis (missing excluded by mean)
demographic.secondary_diagnosis.mean()
0.18456625074272134
Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.
# premature_weeks: weeks premature, derived as |code - 8| * 2, so code 8
# maps to 0 weeks (full term) and each step below 8 adds two weeks.
# Code 9 is treated as missing.
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3492 null values for premature_weeks
# Distribution of weeks premature
demographic.premature_weeks.value_counts()
0.0 9803 2.0 585 4.0 373 12.0 205 6.0 183 10.0 154 8.0 120 14.0 42 16.0 3 Name: premature_weeks, dtype: int64
Recode implant technology variables for each ear to one of five categories (None, OAD, Hearing aid, Cochlear implant, Other):
# Raw right-ear technology codes before recoding
demographic.tech_ad.value_counts()
1.0 5090 0.0 4379 7.0 1554 5.0 1022 2.0 519 6.0 426 8.0 76 9.0 70 4.0 29 3.0 28 10.0 3 Name: tech_ad, dtype: int64
# Category labels indexed by the recoded values assigned below
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]
# Right ear: map raw tech_ad codes onto tech_cats indices
# (default 4 = "Other"; raw missing stays missing)
demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None
# Left ear: same mapping applied to tech_as
demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
# Distribution of recoded left-ear technology
demographic.tech_left.value_counts()
2.0 6754 3.0 4455 0.0 1877 4.0 60 1.0 20 Name: tech_left, dtype: int64
# Distribution of recoded right-ear technology
demographic.tech_right.value_counts()
2.0 6663 3.0 4881 0.0 1554 4.0 70 1.0 28 Name: tech_right, dtype: int64
Substitute valid missing values for hearing loss:
# Type-of-loss code 5 denotes missing; convert to actual missing values.
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
# BUG FIX: the left-ear test previously overwrote type_hl_ad (the right
# ear); it must clear type_hl_as.
demographic.loc[demographic.type_hl_as==5, 'type_hl_as'] = None
Create `degree_hl`, which is the maximum level of hearing loss in either ear:
# degree_hl: worst (maximum) degree of hearing loss across the two ears
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)
Create compound indicator variable for each technology (Baha, Hearing aid, Cochlear implant):
# Inspect current column set before building the compound indicators
demographic.columns
Index(['redcap_event_name', 'academic_year', 'hl', 'male', 'race', 'prim_lang', 'sib', '_mother_ed', 'father_ed', 'premature_age', 'onset_1', 'age_amp', 'age_int', 'age', 'synd_cause', 'etiology', 'etiology_2', 'hearing_changes', 'ae', 'ad_250', 'ad_500', 'degree_hl_ad', 'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500', 'degree_hl_as', 'type_hl_as', 'tech_as', 'age_ci_2', 'time', 'age_disenrolled', 'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo', 'fam_age', 'family_inv', 'att_days_sch', 'att_days_st2_417', 'att_days_hr', 'demo_ses', 'school_lunch', 'medicaid', 'study_id', 'non_english', 'mother_ed', 'secondary_diagnosis', 'premature_weeks', 'tech_right', 'tech_left', 'degree_hl'], dtype='object')
# For each technology, count the ears using it per row: 0 = neither ear,
# 1 = one ear, 2 = both ears; missing when technology is unknown for BOTH
# ears. Value counts are reported on one row per unique student.
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))
demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
# BUG FIX: the null test previously checked tech_right twice, so
# hearing_aid was nulled whenever the right ear was unknown even when the
# left ear was known. Mirror the oad/cochlear logic: both ears unknown.
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))
demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad: 0 4676 1 4 2 2 Name: oad, dtype: int64 There are 1711 null values for OAD hearing_aid: 2 2190 0 1648 1 813 Name: hearing_aid, dtype: int64 There are 1764 null values for hearing_aid cochlear: 0 3120 2 924 1 638 Name: cochlear, dtype: int64 There are 1711 null values for cochlear 14960
Identify bilateral and bimodal individuals:
# Indicator variables from the ear counts above (1 = one ear, 2 = both)
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
# bimodal: one cochlear implant plus one hearing aid
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
(3603, 5485, 1423, 2130)
# Same counts restricted to one row per unique student
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci',
'bilateral_ha',
'bimodal']].sum()
unilateral_ci 638 bilateral_ci 924 bilateral_ha 2190 bimodal 385 dtype: int64
Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)
# tech: 0 by default; for bimodal subjects, 1 when the hearing aid is on
# the left ear, 2 when it is on the right
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
# implant_category: encode each (cochlear, hearing_aid, oad) ear-count
# combination as a single categorical code 0-8. Rows matching none of the
# combinations stay missing.
demographic["implant_category"] = None
_combos = [
    (1, 0, 0),  # 0: single cochlear implant only
    (0, 1, 0),  # 1: single hearing aid only
    (0, 0, 1),  # 2: single OAD only
    (2, 0, 0),  # 3: bilateral cochlear implants
    (1, 1, 0),  # 4: cochlear implant + hearing aid
    (1, 0, 1),  # 5: cochlear implant + OAD
    (0, 2, 0),  # 6: bilateral hearing aids
    (0, 1, 1),  # 7: hearing aid + OAD
    (0, 0, 2),  # 8: bilateral OADs
]
for _code, (_n_ci, _n_ha, _n_oad) in enumerate(_combos):
    demographic.loc[(demographic.cochlear==_n_ci)
                    & (demographic.hearing_aid==_n_ha)
                    & (demographic.oad==_n_oad), 'implant_category'] = _code
demographic.implant_category.value_counts()
6 5485 3 3603 4 1423 1 999 0 687 8 15 2 12 7 5 5 1 Name: implant_category, dtype: int64
Age when hearing loss was diagnosed. Data are entered inconsistently here, so we have to go in and replace non-numeric values.
# Inspect the distinct onset-age values (now fully numeric)
demographic.onset_1.unique()
array([ 15. , 4. , 0. , 26. , 36. , 24. , 80. , 14. , 62. , 2. , 49. , 19. , 23. , 18. , 9. , nan, 10. , 12. , 1. , 5. , 30. , 7. , 51. , 8. , 3. , 17. , 50. , 31. , 34. , 28. , 35. , 38. , 95. , 42. , 13. , 16. , 61. , 46. , 22. , 53. , 59. , 88. , 6. , 37. , 96. , 52. , 64. , 65. , 48. , 97. , 25. , 47. , 79. , 107. , 74. , 77. , 84. , 60. , 41. , 33. , 39. , 27. , 11. , 20. , 21. , 45. , 29. , 32. , 81. , 1.5, 55. , 70. , 58. , 154. , 54. , 78. , 43. , 57. , 83. , 44. , 72. , 116. , 40. , 119. , 63. , 66. , 56. , 87. , 76. , 68. , 92. , 140. , 86. , 126. , 85. , 133. , 103. , 67. , 71. , 2.5, 98. , 75. , 0.5, 152. , 89. ])
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0,
# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
# onset_1 is numeric at the source now, so use it directly
demographic['age_diag'] = demographic.onset_1
Number of null values for age_diag
# Count missing diagnosis ages
demographic.age_diag.isnull().sum()
3993
# Human-readable sex label from the 0/1 male indicator
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
import seaborn as sb
# One row per student (first record with non-missing sex), for plotting
unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()
# ag = sb.factorplot("sex", data=unique_students,
# palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()),
# 'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')
Child has another diagnosed disability
# known_synd: True where synd_cause==0; codes > 1 (unknown or suspected)
# become missing
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
# If either known syndrome or secondary diagnosis
# NOTE(review): with Python `or`, NaN is truthy, so a missing
# secondary_diagnosis yields NaN here even when known_synd is True --
# confirm this propagation of missingness is intended.
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)
Missing sibling counts were properly encoded as `None` (missing).
# Sibling-count code 4 denotes missing; convert to an actual missing value
demographic.loc[demographic.sib==4, 'sib'] = None
We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.
# Category labels indexed by the recoded values assigned below
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
# Keep the raw codes in _race; recode 7 (unknown) to missing and pool all
# remaining codes above 3 into 4 ("Other")
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race: 0.0 7801 2.0 2554 1.0 1367 3.0 1044 6.0 725 8.0 531 7.0 239 4.0 65 5.0 33 Name: _race, dtype: int64 race: 0.0 7801 2.0 2554 1.0 1367 4.0 1354 3.0 1044 Name: race, dtype: int64 There are 840 null values for race
Recode implant technology variables
# Second technology recoding pass.
# NOTE(review): this OVERWRITES the tech_right/tech_left computed earlier
# with a different mapping -- confirm which recoding is intended to stand.
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]
demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
# |x - 3| flips the scale: 0->3, 1->2, 2->1, 3->0, but also 4->1.
# NOTE(review): the 4 -> 1 collision with 2 -> 1 looks suspicious -- verify.
demographic.tech_right = np.abs(demographic.tech_right - 3)
demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan,
# 'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
# 'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
# '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
# Preview the academic-year cleanup: normalize known typos, map the
# all-zero placeholder to missing, and turn '*' separators into '-'.
# regex=False makes the single-character replacement explicitly literal --
# a bare '*' is an invalid regular expression, and the default for
# Series.str.replace changed across pandas versions.
demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012',
'0000-0000': np.nan}).str.replace('*', '-', regex=False).unique()
array(['2002-2003', '2008-2009', '2009-2010', nan, '2009-2011', '2006-2007', '2007-2008', '2011-2012', '2015-2016', '2014-2015', '2013-2014', '2012-2013', '2010-2011', '2005-2006', '2014', '2012-', '2006-2007 ', '2003-2004', '2015-206', '2004-2005', ' 2010-2011 2010-2011', '2012', '2011', '2010', '2009', '2013', '1995-1996', '1998-1999', '2001-2002', '1999-2000', '2000-2001', '1997-1998', '2014-15', '2015', '2015-2015', '2014-2015 ', '2041-2015', '2015-2106', '22014-2015', '2014-1015', '2012-2013 '], dtype=object)
# Apply the academic-year cleanup (a superset of the preview's mapping).
# regex=False: the '*' pattern is a literal character, not a regex.
demographic['academic_year'] = demographic.academic_year.replace(
{'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011',
'2020-2011': '2010-2011', '2012-20013': '2012-2013',
'642014-2015': '2014-2015', '20114-2015': '2014-2015',
'2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
'2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
'2015-2015': '2014-2015', '2009-2011': '2009-2010',
'0000-0000': np.nan}).str.replace('*', '-', regex=False)
Removed entries that don't contain dashes
# Blank out any remaining values without a dash, then strip internal
# whitespace from the valid ones
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')),
'academic_year'] = np.nan
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
# Distribution of age at amplification
demographic.age_amp.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10fd8e2b0>
We converted the expressive vocabulary dataset to "long" format:
# Classify each expressive-vocabulary row by which test(s) were given and
# pick a single score (EOWPVT preferred when both are present).
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
# FIX: copy the subset so the .loc assignments below write to this frame
# rather than a view (avoids SettingWithCopyWarning).
expressive = expressive[EOWPVT | EVT].copy()
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
# Score: EOWPVT preferred when both tests are present
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
# Distribution of expressive test types
expressive.test_type.value_counts()
EVT 3812 EOWPVT 2707 EOWPVT and EVT 148 Name: test_type, dtype: int64
A `school` variable was added, which is the first four characters of the `study_id`:
# First four characters of study_id identify the school/site
expressive["school"] = expressive.study_id.str.slice(0,4)
The age was taken to be the EOWPVT age if there are both test types:
# Age at test: prefer the EOWPVT age, falling back to the EVT age
expressive["age_test"] = expressive.age_test_eowpvt.fillna(expressive.age_test_evt)
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Reduce to the core columns and tag the domain for the later merge
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 58.0 | EOWPVT | 0101 | 54.0 | Expressive Vocabulary |
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 84.0 | EOWPVT | 0101 | 80.0 | Expressive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 113.0 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 53.0 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87.0 | EOWPVT | 0101 | 66.0 | Expressive Vocabulary |
We converted the receptive vocabulary data table to "long" format:
# Inspect the receptive table's columns before reshaping
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Classify each receptive-vocabulary row by which test(s) were given and
# pick a single score (PPVT preferred when both are present).
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
# FIX: copy the subset so the .loc assignments below write to this frame
# rather than a view (avoids SettingWithCopyWarning).
receptive = receptive[PPVT | ROWPVT].copy()
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
# Score: PPVT preferred when both tests are present
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
A `school` variable was added, which is the first four characters of the `study_id`:
# First four characters of study_id identify the school/site
receptive["school"] = receptive.study_id.str.slice(0,4)
The age was taken to be the PPVT age if there are both test types:
# Prefer the PPVT test age; fall back to the ROWPVT test age when PPVT is missing
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 27 null values for age_test
Finally, we dropped unwanted columns and added a domain identification column for merging:
# Keep only the merge keys plus derived columns, and tag rows with their domain
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 90.0 | PPVT | 0101 | 80.0 | Receptive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 101.0 | ROWPVT | 0101 | 113.0 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55.0 | PPVT | 0101 | 44.0 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 80.0 | PPVT | 0101 | 54.0 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 101.0 | PPVT | 0101 | 68.0 | Receptive Vocabulary |
receptive.study_id.unique().shape
(3076,)
The four datasets were merged into a single table. First, we concatenated the test scores data:
test_scores = pd.concat([articulation, expressive, receptive, language])
Then we perform a merge between the demographic data and the test scores data:
# Left merge keeps every demographic row, with or without matching test scores
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
lsl_dr.tail()
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | sex | known_synd | synd_or_disab | race | age_test | domain | school | score | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
38196 | year_9_complete_71_arm_1 | 2010-2011 | 0.0 | 1.0 | 3.0 | 2.0 | 1.0 | 4.0 | 4.0 | 9.0 | ... | Male | 1.0 | 1.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN |
38197 | year_9_complete_71_arm_1 | 2008-2009 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2.0 | 8.0 | ... | Male | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
38198 | year_9_complete_71_arm_1 | 2013-2014 | 0.0 | 1.0 | 2.0 | 0.0 | NaN | 6.0 | 6.0 | 9.0 | ... | Male | 0.0 | 0.0 | 2.0 | 138 | Expressive Vocabulary | 0310 | 89 | NaN | EOWPVT |
38199 | year_9_complete_71_arm_1 | 2013-2014 | 0.0 | 1.0 | 2.0 | 0.0 | NaN | 6.0 | 6.0 | 9.0 | ... | Male | 0.0 | 0.0 | 2.0 | 138 | Receptive Vocabulary | 0310 | 82 | NaN | PPVT |
38200 | year_9_complete_71_arm_1 | 2011-2012 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 3.0 | 6.0 | 9.0 | ... | Male | 0.0 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 73 columns
# Starting year of the academic year, e.g. '2013' from '2013-2014'
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
2013 6940 2012 6650 2014 5821 2011 5245 2010 4445 nan 3077 2009 2455 2015 1167 2008 835 2007 533 2006 344 2005 286 2004 172 2003 90 2002 47 2001 37 1999 16 1998 16 2000 12 1997 6 2201 5 1995 1 2041 1 Name: academic_year_start, dtype: int64
# Toggle to restrict all downstream analyses to the 2013-2014 academic year
current_year_only = False
if current_year_only:
lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
# Histograms of expressive vocabulary standard scores (>= 20) and of
# expressive language standard scores
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language')
& (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
Export dataset
# Export the merged dataset to CSV
if current_year_only:
lsl_dr.to_csv('lsl_dr_current_year.csv')
else:
lsl_dr.to_csv('lsl_dr.csv')
lsl_dr.shape
(38201, 74)
# Unique student counts in the merged and demographic tables (both 5807)
lsl_dr.study_id.unique().shape
(5807,)
demographic.study_id.unique().shape
(5807,)
Convert scores to floating-point numbers:
# Coerce scores to float, then derive technology class and age in years
lsl_dr.score = lsl_dr.score.astype(float)
# Default to 'Bimodal'; bilateral CI / bilateral HA flags override, in that order
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
lsl_dr.domain.dropna().unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', 'Receptive Vocabulary'], dtype=object)
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
tech_class Bilateral CI 0.43 Bilateral HA 0.58 Bimodal 0.50 Name: prim_lang, dtype: float64
# Flag students with less-than-profound hearing loss (degree_hl codes below 6)
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
tech_class Bilateral CI 0.08 Bilateral HA 0.87 Bimodal 0.31 Name: non_profound, dtype: float64
# Mean score by age (2-6 years) and technology class, one panel per domain
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
ax.set_ylim(40, 120)
ax.set_xticks(range(2,7))
ax.set_title(dom)
# Age distribution of PPVT records
ppvt_only = lsl_dr[lsl_dr.test_type=='PPVT']
ppvt_only.age_year.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10d99beb8>
# PPVT scores for 3-5 year-olds: overall distribution and per-age summary
ppvt_345 = ppvt_only[ppvt_only.age_year.isin([3,4,5])]
ppvt_345.score.describe()
count 1978.000000 mean 89.923660 std 21.045408 min 20.000000 25% 77.000000 50% 90.000000 75% 105.000000 max 154.000000 Name: score, dtype: float64
ppvt_345.groupby('age_year').agg({'score':[min, max, np.median, np.count_nonzero]})
score | ||||
---|---|---|---|---|
min | max | median | count_nonzero | |
age_year | ||||
3.0 | 20.0 | 150.0 | 95.0 | 1196.0 |
4.0 | 20.0 | 154.0 | 87.0 | 481.0 |
5.0 | 20.0 | 130.0 | 81.0 | 301.0 |
lsl_dr.test_type.value_counts()
expressive 6581 receptive 6530 Goldman 5286 PPVT 4366 EVT 3812 EOWPVT 2707 ROWPVT 2272 Arizonia 502 PPVT and ROWPVT 197 EOWPVT and EVT 148 Arizonia and Goldman 73 Name: test_type, dtype: int64
# Age distribution of EVT records
evt_only = lsl_dr[lsl_dr.test_type=='EVT']
evt_only.age_year.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10c6e1550>
# EVT score summaries for 3-5 year-olds
evt_345 = evt_only[evt_only.age_year.isin([3,4,5])]
evt_345.groupby('age_year').agg({'score':[min, max, np.median, np.count_nonzero]})
score | ||||
---|---|---|---|---|
min | max | median | count_nonzero | |
age_year | ||||
3.0 | 20.0 | 146.0 | 99.0 | 1095.0 |
4.0 | 20.0 | 146.0 | 90.0 | 415.0 |
5.0 | 20.0 | 130.0 | 85.0 | 273.0 |
# PLS records with values coerced to numeric types.
# NOTE(review): DataFrame.convert_objects is deprecated (the notebook's own
# FutureWarning below says so) and removed in modern pandas; pd.to_numeric
# is the replacement -- left as-is here to preserve recorded outputs.
pls_only = (language[(language.test_name=='PLS')]
.convert_objects(convert_numeric=True))
# Age in whole years from age at test in months
pls_only['age_year'] = np.floor(pls_only.age_test/12).astype(int)
pls_345 = pls_only[pls_only.age_year.isin([3,4,5])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric. from ipykernel import kernelapp as app
# Score summaries and the share of scores within normal limits (>= 85),
# by age and receptive/expressive test type
(pls_345.assign(normal_limits=pls_345.score>=85).groupby(['age_year', 'test_type'])
.agg({'score':[min, max, np.median, len],
'normal_limits': np.mean}))
score | normal_limits | |||||
---|---|---|---|---|---|---|
min | max | median | len | mean | ||
age_year | test_type | |||||
3 | expressive | 50.0 | 145.0 | 78.0 | 795.0 | 0.355975 |
receptive | 50.0 | 140.0 | 80.0 | 795.0 | 0.406289 | |
4 | expressive | 50.0 | 141.0 | 73.0 | 587.0 | 0.287905 |
receptive | 50.0 | 136.0 | 77.0 | 591.0 | 0.382403 | |
5 | expressive | 50.0 | 138.0 | 68.0 | 298.0 | 0.265101 |
receptive | 50.0 | 129.0 | 73.0 | 300.0 | 0.293333 |
# CELF-P2 subtest records with values coerced to numeric types.
# NOTE(review): convert_objects is deprecated/removed -- see note on the PLS
# cell; pd.to_numeric is the modern replacement.
celf_only = (language_subtest[(language_subtest.test_name=='CELF-P2')]
.convert_objects(convert_numeric=True))
# Age in whole years; keep ages 4 and 6 for the subtest comparison
celf_only['age_year'] = np.floor(celf_only.age_test/12).astype(int)
celf_46 = celf_only[celf_only.age_year.isin([4,6])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric. from ipykernel import kernelapp as app
# Median CELF-P2 subtest standard scores at ages 4 and 6
subtests = ['celfp_ss_ss', 'celfp_ws_ss',
'celfp_ev_ss', 'celfp_fd_ss',
'celfp_rs_ss', 'celfp_bc_ss',
'celfp_wcr_ss', 'celfp_wce_ss',
'celfp_wct_ss']
(celf_46.groupby('age_year')
.agg({st:np.median for st in subtests})).T
age_year | 4 | 6 |
---|---|---|
celfp_bc_ss | 9.0 | 4.5 |
celfp_wce_ss | 9.0 | 7.0 |
celfp_ev_ss | 8.0 | 5.0 |
celfp_wcr_ss | 10.0 | 10.0 |
celfp_ss_ss | 8.0 | 5.0 |
celfp_ws_ss | 6.0 | 4.0 |
celfp_wct_ss | 9.0 | 8.0 |
celfp_rs_ss | 7.0 | 4.0 |
celfp_fd_ss | 8.0 | 4.0 |
plot_color = "#64AAE8"
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None,
                   ylim=None, title=None, **kwargs):
    """Bar-plot the value counts of a categorical series, annotating each bar.

    Parameters
    ----------
    series : pandas Series whose value_counts() (sorted by category) are plotted.
    labels : optional sequence of tick labels replacing the raw category values.
    color, rot : bar color and tick-label rotation, forwarded to Series.plot.
    label_offset : vertical offset (data units) of each count annotation.
    xlim, ylim : optional axis limits; x defaults to fit the bars snugly.
    title : optional axes title.
    kwargs : extra keyword arguments forwarded to Series.plot; may include 'ax'.
    """
    # Pop 'ax' so it is not passed twice, and hand it to plot() explicitly.
    # Previously a freshly created axes was only picked up implicitly as the
    # "current axes", which breaks if another figure becomes current.
    ax = kwargs.pop('ax', None)
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, ax=ax, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    # Annotate each bar with its count
    for i, x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
# One row per student (first occurrence of each study_id)
unique_students = demographic.drop_duplicates('study_id')
unique_students.shape
(5807, 67)
unique_students.age.describe()
count 5290.00000 mean 29.50000 std 27.68008 min 0.00000 25% 8.00000 50% 24.00000 75% 40.00000 max 298.00000 Name: age, dtype: float64
# Demographic bar charts: sex and primary language
plot_demo_data(unique_students.male,
('Female', 'Male'), label_offset=20, color=plot_color)
plot_demo_data(unique_students.prim_lang,
('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'),
rot=70, color=plot_color)
unique_students.prim_lang.count()
5242
# Number of siblings per student
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'),
color=plot_color)
unique_students.sib.count()
4846
# Labels for the age-at-amplification bins used below
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months",
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years",
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]
# age_amp code 11 is recoded as missing
demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4911 null values for age_amp
# Bin each student's age at amplification (months) into the labeled ranges above.
# DataFrame.sort() is deprecated (see the FutureWarning this cell emitted) and
# removed in modern pandas; sort_values() is the supported replacement.
age_classes = pd.Series(pd.cut(unique_students.sort_values('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__':
# Bar chart of age-at-amplification counts, in bin order, annotated with counts
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
for i,x in enumerate(age_amp_counts):
plt.annotate('%i' % x, (i, x + 10))
age_amp_counts.sum()
3627
# Oldest age at amplification (months), then a histogram in years
unique_students.age_amp.max()
173.0
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
<matplotlib.text.Text at 0x10df86828>
# Hearing technology by ear (tech_cats is defined earlier in the notebook)
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
# Two-panel version: right ear over left ear
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90,
ax=axes[0], title='Right ear', color=plot_color, ylim=(0, 2500))
plot_demo_data(unique_students.tech_left, tech_cats, rot=90,
ax=axes[1], title='Left ear', color=plot_color)
# Non-missing technology records, right then left ear
unique_students.tech_right.count()
4651
unique_students.tech_left.count()
4643
# Degree of hearing loss by ear (degree_hl_ad = right ear, degree_hl_as = left ear)
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90,
color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90,
color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
# Non-missing degree records, then type of hearing loss by ear
unique_students.degree_hl_as.count()
4550
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
unique_students.type_hl_ad.count()
4482
unique_students.type_hl_as.count()
4575
# Two-panel version: type of hearing loss, right ear over left ear
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90,
title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90,
title='Left ear', ax=axes[1], color=plot_color)
demographic[demographic.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | bilateral_ci | bilateral_ha | bimodal | tech | implant_category | age_diag | sex | known_synd | synd_or_disab | race | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14329 | initial_assessment_arm_1 | 2010-2011 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14330 | year_1_complete_71_arm_1 | 2011-2012 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14331 | year_2_complete_71_arm_1 | 2012-2013 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
14332 | year_3_complete_71_arm_1 | 2013-2014 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | False | True | False | 0 | 6 | 51.0 | Female | 0.0 | 0.0 | 0.0 |
4 rows × 67 columns
receptive[receptive.study_id=='1147-2010-0064']
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
14329 | 1147-2010-0064 | initial_assessment_arm_1 | 96.0 | PPVT | 1147 | 63.0 | Receptive Vocabulary |
14330 | 1147-2010-0064 | year_1_complete_71_arm_1 | 91.0 | PPVT | 1147 | 73.0 | Receptive Vocabulary |
14331 | 1147-2010-0064 | year_2_complete_71_arm_1 | 93.0 | PPVT | 1147 | 85.0 | Receptive Vocabulary |
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
redcap_event_name | academic_year | hl | male | _race | prim_lang | sib | _mother_ed | father_ed | premature_age | ... | age_test | domain | school | score | test_name | test_type | academic_year_start | tech_class | age_year | non_profound | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5902 | initial_assessment_arm_1 | 2010-2011 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 63 | Expressive Vocabulary | 1147 | 91.0 | NaN | EVT | 2010 | Bilateral HA | 4.0 | True |
5903 | initial_assessment_arm_1 | 2010-2011 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 63 | Receptive Vocabulary | 1147 | 96.0 | NaN | PPVT | 2010 | Bilateral HA | 4.0 | True |
5904 | initial_assessment_arm_1 | 2010-2011 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 59 | Language | 1147 | 101.0 | PLS | receptive | 2010 | Bilateral HA | 4.0 | True |
5905 | initial_assessment_arm_1 | 2010-2011 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 59 | Language | 1147 | 87.0 | PLS | expressive | 2010 | Bilateral HA | 4.0 | True |
14321 | year_1_complete_71_arm_1 | 2011-2012 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 72 | Expressive Vocabulary | 1147 | 86.0 | NaN | EVT | 2011 | Bilateral HA | 4.0 | True |
14322 | year_1_complete_71_arm_1 | 2011-2012 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 73 | Receptive Vocabulary | 1147 | 91.0 | NaN | PPVT | 2011 | Bilateral HA | 4.0 | True |
24001 | year_2_complete_71_arm_1 | 2012-2013 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 88 | Expressive Vocabulary | 1147 | 95.0 | NaN | EVT | 2012 | Bilateral HA | 4.0 | True |
24002 | year_2_complete_71_arm_1 | 2012-2013 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | 85 | Receptive Vocabulary | 1147 | 93.0 | NaN | PPVT | 2012 | Bilateral HA | 4.0 | True |
30342 | year_3_complete_71_arm_1 | 2013-2014 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 3.0 | 8.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2013 | Bilateral HA | 4.0 | True |
9 rows × 77 columns
unique_students.type_hl_ad.count()
4482
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
(3076,)
demographic.study_id.unique().shape
(5807,)
receptive.study_id.unique().shape
(3076,)
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
(3076,)
receptive_ids = receptive.study_id.unique()
demographic_ids = demographic.study_id.unique()
[s for s in receptive_ids if s not in demographic_ids]
[]
def score_summary(domain, test_type=None):
    """Summarize standard scores by age (in whole years) for one test domain.

    Parameters
    ----------
    domain : value of lsl_dr.domain to summarize (e.g. 'Receptive Vocabulary').
    test_type : optional value of lsl_dr.test_type to further restrict rows.

    Returns
    -------
    DataFrame indexed by age in years (2-11, with everything above 11 pooled
    into the top bin) with columns Sample Size, Mean, SD, Min, Max.
    """
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    # Convert age at test from months to whole years; rows with missing ages
    # become NaN after index alignment and are excluded by the filter below.
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    # Pool all ages above 11 into a single top bin, and drop ages <= 1
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    # Renamed from min/max to avoid shadowing the builtins
    score_min = byage.score.min()
    score_max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean,
                            'SD':sd, 'Min':score_min, 'Max':score_max})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]
# Receptive vocabulary score summary by age
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 412 | 93.546117 | 18.140445 | 40.0 | 144.0 |
3 | 1428 | 92.067227 | 19.347476 | 0.0 | 150.0 |
4 | 1547 | 90.796380 | 20.277519 | 0.0 | 149.0 |
5 | 1161 | 89.919897 | 18.110998 | 0.0 | 142.0 |
6 | 652 | 85.914110 | 16.302309 | 40.0 | 154.0 |
7 | 424 | 83.169811 | 16.066041 | 40.0 | 130.0 |
8 | 304 | 80.700658 | 17.624780 | 20.0 | 132.0 |
9 | 227 | 78.193833 | 17.638889 | 25.0 | 160.0 |
10 | 191 | 76.324607 | 17.481099 | 20.0 | 123.0 |
11 | 459 | 78.588235 | 18.949552 | 20.0 | 134.0 |
receptive_summary.describe()
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
count | 10.000000 | 10.000000 | 10.000000 | 10.0000 | 10.000000 |
mean | 680.500000 | 84.922087 | 17.993911 | 20.5000 | 141.800000 |
std | 507.197365 | 6.377836 | 1.295244 | 16.4063 | 11.802071 |
min | 191.000000 | 76.324607 | 16.066041 | 0.0000 | 123.000000 |
25% | 331.000000 | 79.116341 | 17.517019 | 5.0000 | 132.500000 |
50% | 441.500000 | 84.541961 | 17.874943 | 20.0000 | 143.000000 |
75% | 1033.750000 | 90.577259 | 18.747275 | 36.2500 | 149.750000 |
max | 1547.000000 | 93.546117 | 20.277519 | 40.0000 | 160.000000 |
receptive_summary['Sample Size'].sum()
6805
# Bar chart of receptive vocabulary sample sizes by age, annotated with counts
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
<matplotlib.text.Text at 0x1105ff320>
# Expressive vocabulary score summary by age
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 390 | 92.753846 | 22.081898 | 23.0 | 145.0 |
3 | 1376 | 93.390262 | 21.591975 | 0.0 | 145.0 |
4 | 1525 | 92.449180 | 21.817895 | 0.0 | 146.0 |
5 | 1136 | 91.602113 | 20.054994 | 0.0 | 145.0 |
6 | 650 | 87.018462 | 18.442505 | 20.0 | 146.0 |
7 | 425 | 84.037647 | 15.699522 | 38.0 | 131.0 |
8 | 295 | 84.037288 | 16.455319 | 34.0 | 122.0 |
9 | 213 | 81.793427 | 16.060750 | 36.0 | 145.0 |
10 | 185 | 81.816216 | 15.279596 | 40.0 | 122.0 |
11 | 460 | 84.821739 | 17.366822 | 18.0 | 146.0 |
expressive_summary['Sample Size'].sum()
6655
# Bar chart of expressive vocabulary sample sizes by age; the y limit depends
# on whether the dataset was restricted to the current year
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
plt.ylim(0, 800)
else:
plt.ylim(0, 1800)
# Articulation score summary by age
articulation_summary = score_summary("Articulation")
articulation_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 297 | 85.225589 | 14.870600 | 50.0 | 122.0 |
3 | 1194 | 83.618090 | 18.348582 | 40.0 | 126.0 |
4 | 1368 | 83.526316 | 20.745277 | 0.0 | 123.0 |
5 | 1065 | 83.881690 | 34.906576 | 39.0 | 999.0 |
6 | 614 | 79.534202 | 21.707111 | 39.0 | 115.0 |
7 | 402 | 80.402985 | 51.064887 | 3.0 | 999.0 |
8 | 259 | 78.876448 | 21.283801 | 40.0 | 107.0 |
9 | 188 | 81.617021 | 20.547639 | 40.0 | 109.0 |
10 | 145 | 81.317241 | 20.068184 | 40.0 | 105.0 |
11 | 324 | 84.632716 | 54.558552 | 39.0 | 999.0 |
articulation_summary['Sample Size'].sum()
5856
# Bar chart of articulation sample sizes by age, annotated with counts
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);
Language scores
# Inspect the distinct domain and test_type values
lsl_dr.domain.unique()
array(['Expressive Vocabulary', 'Language', 'Articulation', nan, 'Receptive Vocabulary'], dtype=object)
lsl_dr.test_type.unique()
array(['EOWPVT', 'receptive', 'expressive', 'Goldman', nan, 'EVT', 'PPVT', 'Arizonia', 'ROWPVT', 'Arizonia and Goldman', 'EOWPVT and EVT', 'PPVT and ROWPVT'], dtype=object)
# Receptive language (PLS/CELF receptive scores) summary by age
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 957 | 86.323929 | 22.295176 | 50.0 | 150.0 |
3 | 1374 | 84.938137 | 19.634047 | 50.0 | 144.0 |
4 | 1349 | 85.316531 | 19.507433 | 43.0 | 145.0 |
5 | 962 | 83.939709 | 18.839663 | 47.0 | 140.0 |
6 | 495 | 78.078788 | 17.673256 | 11.0 | 127.0 |
7 | 321 | 75.981308 | 18.835628 | 40.0 | 123.0 |
8 | 199 | 74.989950 | 19.793885 | 40.0 | 123.0 |
9 | 54 | 70.425926 | 21.219075 | 40.0 | 120.0 |
10 | 46 | 79.413043 | 20.985261 | 40.0 | 120.0 |
11 | 67 | 76.522388 | 21.469046 | 40.0 | 139.0 |
receptive_language_summary['Sample Size'].sum()
5824
# Bar chart of receptive language sample sizes by age, annotated with counts
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
# Expressive language summary by age
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Sample Size | Mean | SD | Min | Max | |
---|---|---|---|---|---|
2 | 950 | 88.427368 | 18.557020 | 50.0 | 150.0 |
3 | 1375 | 82.408000 | 17.458500 | 20.0 | 147.0 |
4 | 1341 | 80.609247 | 19.553739 | 45.0 | 141.0 |
5 | 983 | 78.691760 | 20.189772 | 45.0 | 144.0 |
6 | 513 | 71.773879 | 19.234357 | 6.0 | 140.0 |
7 | 343 | 67.128280 | 20.948304 | 40.0 | 124.0 |
8 | 205 | 68.014634 | 21.506834 | 40.0 | 118.0 |
9 | 54 | 65.629630 | 21.286275 | 40.0 | 108.0 |
10 | 46 | 77.217391 | 24.107088 | 40.0 | 119.0 |
11 | 66 | 73.939394 | 22.574239 | 40.0 | 132.0 |
expressive_language_summary['Sample Size'].sum()
5876
# Bar chart of expressive language sample sizes by age, annotated with counts
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
(unique_students.age/12.).describe()
count 5290.000000 mean 2.458333 std 2.306673 min 0.000000 25% 0.666667 50% 2.000000 75% 3.333333 max 24.833333 Name: age, dtype: float64
def calc_difference(x, col='a_fo', jitter=True):
    """Change in rating `col` between a student's earliest and latest
    functional-outcome assessments.

    Parameters
    ----------
    x : DataFrame for one study_id group, containing `col` and 'funct_out_age'.
    col : rating column to difference (default 'a_fo', the audition rating).
    jitter : add small Gaussian noise to the difference (for scatter plots).

    Returns
    -------
    dict with 'difference' (latest rating minus earliest rating) and 'months'
    (span between the two assessment ages), or None when the group has fewer
    than two records or any missing values.
    """
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    # idxmax/idxmin return index *labels*, which is what []-indexing expects.
    # Series.argmax is positional in modern pandas and silently selects the
    # wrong rows (or raises) on groups whose index is not 0..n-1.
    diff = x[col][x.funct_out_age.idxmax()] - x[col][x.funct_out_age.idxmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    # Flag implausibly long spans between assessments for manual inspection
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
# Audition progress vs. time between a student's first and last ratings
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
<matplotlib.text.Text at 0x10c88fdd8>
# Spoken language comprehension progress vs. time between ratings
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
<matplotlib.text.Text at 0x10c89c2e8>
# Spoken language expression progress vs. time between ratings
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
<matplotlib.text.Text at 0x11011b6d8>
lsl_dr.degree_hl.dropna().value_counts()
6.0 17270 4.0 4580 3.0 4446 5.0 4246 2.0 1740 0.0 1271 1.0 301 Name: degree_hl, dtype: int64
# Histogram of degree of hearing loss, then of the positive gap between
# age and age at intervention
ax = lsl_dr.degree_hl.hist(bins=7)
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x11014fda0>
# Share of records with age_int (and age) under 6 months
(lsl_dr.age_int<6).mean()
0.20227219182743908
(lsl_dr.age<6).mean()
0.13415879165466874
Counts by year
# One row per student from the merged table; enrollment counts by academic
# year (dropping the last index entry) and disability status stacked by year
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');
disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x10c8b7630>