In [1]:
# Import modules and set options
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Connect to database to import data for the three test domains and demographic information:

In [2]:
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)
In [3]:
metadata = lsl_dr_project.export_metadata()
In [4]:
# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))

Import each database from REDCap:

In [5]:
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None,
                                                                                                'na_values':[999, 9999]})
In [6]:
records = lsl_dr_project.export_records(fields=articulation_fields)
In [7]:
print(records[0]['study_id'])
0101-2002-0101
In [8]:
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})
In [9]:
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})
In [10]:
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf',
                   'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
                   'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
                   'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})
In [11]:
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year', 'academic_year_rv',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})
In [12]:
demographic_raw.academic_year_rv.value_counts()
Out[12]:
2013.0    2501
2012.0    2429
2014.0    2170
2011.0    1901
2010.0    1609
2009.0    1021
2015.0     931
2008.0     436
2007.0     277
2006.0     189
2005.0     138
2004.0      89
2003.0      65
2002.0      36
2001.0      24
2000.0      12
1999.0      12
1998.0       9
15.0         3
1997.0       2
1995.0       1
Name: academic_year_rv, dtype: int64
In [13]:
demographic_raw[demographic_raw.study_id=='1147-2010-0064']
Out[13]:
study_id redcap_event_name academic_year academic_year_rv hl gender race prim_lang sib mother_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
14665 1147-2010-0064 initial_assessment_arm_1 2010-2011 2010.0 0.0 0.0 0.0 0.0 1.0 3.0 ... 3.0 6.0 65.0 0.0 NaN NaN NaN NaN NaN NaN
14666 1147-2010-0064 year_1_complete_71_arm_1 2011-2012 2011.0 0.0 NaN NaN NaN NaN NaN ... 3.0 5.0 77.0 2.0 NaN NaN NaN NaN NaN NaN
14667 1147-2010-0064 year_2_complete_71_arm_1 2012-2013 2012.0 0.0 NaN NaN NaN NaN NaN ... 3.0 5.0 89.0 2.0 NaN NaN NaN NaN NaN NaN
14668 1147-2010-0064 year_3_complete_71_arm_1 2013-2014 2013.0 0.0 NaN NaN NaN NaN NaN ... 4.0 5.0 101.0 2.0 NaN NaN NaN NaN NaN NaN

4 rows × 47 columns

Attendance information

Several fields in the demographic data have missing values.

In [14]:
demographic_raw.head()
Out[14]:
study_id redcap_event_name academic_year academic_year_rv hl gender race prim_lang sib mother_ed ... sle_fo a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid
0 0101-2002-0101 initial_assessment_arm_1 2002-2003 2002.0 0.0 0.0 0.0 0.0 1.0 6.0 ... 2.0 2.0 54.0 2.0 NaN NaN NaN NaN NaN NaN
1 0101-2002-0101 year_1_complete_71_arm_1 2003-2004 2003.0 0.0 NaN NaN NaN NaN NaN ... 4.0 4.0 80.0 1.0 NaN NaN NaN NaN NaN NaN
2 0101-2002-0101 year_2_complete_71_arm_1 2004-2005 2004.0 0.0 NaN NaN NaN NaN NaN ... 4.0 4.0 80.0 2.0 NaN NaN NaN NaN NaN NaN
3 0101-2002-0101 year_3_complete_71_arm_1 2005-2006 2005.0 0.0 NaN NaN NaN NaN NaN ... 5.0 5.0 96.0 3.0 NaN NaN NaN NaN NaN NaN
4 0101-2002-0101 year_4_complete_71_arm_1 2006-2007 2006.0 0.0 NaN NaN NaN NaN NaN ... 5.0 5.0 109.0 2.0 NaN NaN NaN NaN NaN NaN

5 rows × 47 columns

We can fill missing values forward from previous observation (by study_id)

In [15]:
demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  app.launch_new_instance()

Random check to make sure this worked

In [16]:
demographic[demographic.study_id=='1147-2010-0064']
Out[16]:
redcap_event_name academic_year academic_year_rv hl gender race prim_lang sib mother_ed father_ed ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
14665 initial_assessment_arm_1 2010-2011 2010.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 6.0 65.0 0.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
14666 year_1_complete_71_arm_1 2011-2012 2011.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 5.0 77.0 2.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
14667 year_2_complete_71_arm_1 2012-2013 2012.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 5.0 89.0 2.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064
14668 year_3_complete_71_arm_1 2013-2014 2013.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 5.0 101.0 2.0 NaN NaN NaN NaN NaN NaN 1147-2010-0064

4 rows × 47 columns

Demographic data without missing values:

In [17]:
demographic.head()
Out[17]:
redcap_event_name academic_year academic_year_rv hl gender race prim_lang sib mother_ed father_ed ... a_fo fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses school_lunch medicaid study_id
0 initial_assessment_arm_1 2002-2003 2002.0 0.0 0.0 0.0 0.0 1.0 6.0 6.0 ... 2.0 54.0 2.0 NaN NaN NaN NaN NaN NaN 0101-2002-0101
8001 initial_assessment_arm_1 2009-2010 2009.0 0.0 0.0 0.0 0.0 1.0 5.0 3.0 ... 5.0 138.0 0.0 NaN NaN NaN NaN NaN NaN 0628-2005-1814
7995 initial_assessment_arm_1 2009-2010 2009.0 0.0 0.0 6.0 0.0 0.0 4.0 3.0 ... 4.0 78.0 0.0 NaN NaN NaN NaN NaN NaN 0628-2005-1756
7990 initial_assessment_arm_1 2009-2010 2009.0 0.0 0.0 1.0 0.0 1.0 3.0 4.0 ... 4.0 77.0 0.0 NaN NaN NaN NaN NaN NaN 0628-2005-1744
7987 initial_assessment_arm_1 2009-2010 2009.0 0.0 1.0 1.0 0.0 2.0 6.0 6.0 ... 4.0 118.0 4.0 NaN NaN NaN NaN NaN NaN 0628-2005-1741

5 rows × 47 columns

Cleaning languge dataset

5 language measures:

  • 3 versions of CELF
  • PLS
    • pls_ac_rs: PLS: Auditory Comprehension Raw Score
    • pls_ac_ss: PLS: Auditory Comprehension Standard Score
    • pls_ec_rs: PLS: Expressive Communication Raw Score
    • pls_ec_ss: PLS: Expressive Communication Standard Score
    • pls_tl_rs: PLS: Total Language Score Standard Score Total
    • pls_tl_ss: PLS: Total Language Score Standard Score
  • OWLS
    • age_test_owls: Age at time of testing (OWLS)
    • owls_lc_rs: OWLS: Listening Comprehension Raw Score
    • owls_lc_ss: OWLS: Listening Comprehension Standard Score
    • owls_oe_rs: OWLS: Oral Expression Raw Score
    • owls_oe_ss: OWLS: Oral Expression Standard Score
    • owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
    • owls_oc_ss: OWLS: Oral Composite Standard Score
    • owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
    • owls_wes_as: OWLS: Written Expression Scale Ability Score
    • owls_wes_ss: OWLS: Written Expression Scale Standard Score
    • owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
    • owls_lcss: OWLS: Language Composite Standard Score
In [18]:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type  expressive  receptive
test_name                       
CELF-4            627        545
CELF-P2          1511       1516
OWLS             1093       1099
PLS              3572       3584
There are 0 null values for score

A school variable was added, which is the first four columns of the study_id:

In [19]:
language["school"] = language.study_id.str.slice(0,4)
In [20]:
language_subtest = language[["study_id", "redcap_event_name", "score", "test_type", 
                             "test_name", "school", "age_test", 
                             'celfp_ss_ss', 'celfp_ws_ss', 
                             'celfp_ev_ss', 'celfp_fd_ss',
                             'celfp_rs_ss', 'celfp_bc_ss', 
                             'celfp_wcr_ss', 'celfp_wce_ss',
                             'celfp_wct_ss']]
In [21]:
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
Out[21]:
study_id redcap_event_name score test_type test_name school age_test domain
0 0101-2002-0101 initial_assessment_arm_1 51 receptive PLS 0101 54 Language
5 0101-2002-0101 year_5_complete_71_arm_1 61 receptive OWLS 0101 113 Language
9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS 0101 44 Language
10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS 0101 54 Language
11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 0101 68 Language

Cleaning articulation dataset

We converted the articulation dataset into a "long" format:

In [22]:
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman                 5437
Arizonia                 503
Arizonia and Goldman      73
Name: test_type, dtype: int64
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [23]:
articulation["school"] = articulation.study_id.str.slice(0,4)

The age was taken to be the Arizonia age if there are both test types:

In [24]:
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count    6011.000000
mean       68.857095
std        30.613506
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [25]:
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
Out[25]:
study_id redcap_event_name test_type score school age_test domain
1 0101-2002-0101 year_1_complete_71_arm_1 Goldman 78.0 0101 80.0 Articulation
9 0101-2003-0102 initial_assessment_arm_1 Goldman 72.0 0101 44.0 Articulation
10 0101-2003-0102 year_1_complete_71_arm_1 Goldman 97.0 0101 54.0 Articulation
14 0101-2004-0101 year_2_complete_71_arm_1 Goldman 75.0 0101 53.0 Articulation
15 0101-2004-0101 year_3_complete_71_arm_1 Goldman 80.0 0101 66.0 Articulation

Cleaning demographic dataset

We excluded unwanted columns and rows for which age, gender or race were missing:

In [26]:
# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})

Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

In [27]:
demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))
False    11986
True      2688
Name: non_english, dtype: int64
There are 622 null values for non_english

Mother's education (mother_ed) and father's education (father_ed) were both recoded to:

  • 0=no high school diploma
  • 1=high school
  • 2=undergraduate
  • 3=graduate

Category 6 (unknown) was recoded as missing.

In [28]:
demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))
_mother_ed:
6.0    5340
4.0    3140
3.0    2127
5.0    1696
2.0    1489
1.0     498
0.0     222
Name: _mother_ed, dtype: int64
mother_ed:
1.0    3616
2.0    3140
3.0    1696
0.0     720
Name: mother_ed, dtype: int64

There are 6124 null values for mother_ed

Secondary diagnosis

In [29]:
demographic.shape
Out[29]:
(15296, 49)
In [30]:
demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None
In [31]:
demographic.secondary_diagnosis.value_counts()
Out[31]:
0.0    11224
1.0     2526
Name: secondary_diagnosis, dtype: int64
In [32]:
demographic.secondary_diagnosis.mean()
Out[32]:
0.18370909090909091

Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

In [33]:
demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))
There are 3394 null values for premature_weeks
In [34]:
demographic.premature_weeks.value_counts()
Out[34]:
0.0     10190
2.0       609
4.0       386
12.0      202
6.0       186
10.0      160
8.0       124
14.0       42
16.0        3
Name: premature_weeks, dtype: int64

Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

In [35]:
demographic.tech_ad.value_counts()
Out[35]:
1.0     5221
0.0     4497
7.0     1588
5.0     1056
2.0      529
6.0      433
8.0       76
9.0       70
4.0       31
3.0       26
10.0       4
Name: tech_ad, dtype: int64
In [36]:
tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None

demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None
In [37]:
demographic.tech_left.value_counts()
Out[37]:
2.0    6919
3.0    4579
0.0    1925
4.0      61
1.0      18
Name: tech_left, dtype: int64
In [38]:
demographic.tech_right.value_counts()
Out[38]:
2.0    6841
3.0    5006
0.0    1588
4.0      70
1.0      26
Name: tech_right, dtype: int64

Substitute valid missing values for hearing loss:

In [39]:
demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None

Create degree_hl, which is the maximum level of hearing loss in either ear:

In [40]:
demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)

Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant):

  • 0=none
  • 1=one ear
  • 2=both ears.
In [41]:
demographic.columns
Out[41]:
Index(['redcap_event_name', 'academic_year', 'academic_year_rv', 'hl', 'male',
       'race', 'prim_lang', 'sib', '_mother_ed', 'father_ed', 'premature_age',
       'onset_1', 'age_amp', 'age_int', 'age', 'synd_cause', 'etiology',
       'etiology_2', 'hearing_changes', 'ae', 'ad_250', 'ad_500',
       'degree_hl_ad', 'type_hl_ad', 'tech_ad', 'age_ci', 'as_250', 'as_500',
       'degree_hl_as', 'type_hl_as', 'tech_as', 'age_ci_2', 'time',
       'age_disenrolled', 'funct_out_age', 'slc_fo', 'sle_fo', 'a_fo',
       'fam_age', 'family_inv', 'att_days_sch', 'att_days_st2_417',
       'att_days_hr', 'demo_ses', 'school_lunch', 'medicaid', 'study_id',
       'non_english', 'mother_ed', 'secondary_diagnosis', 'premature_weeks',
       'tech_right', 'tech_left', 'degree_hl'],
      dtype='object')
In [42]:
demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))
oad:
0    4770
1       4
Name: oad, dtype: int64
There are 1711 null values for OAD

hearing_aid:
2    2249
0    1669
1     824
Name: hearing_aid, dtype: int64
There are 1765 null values for hearing_aid

cochlear:
0    3203
2     935
1     636
Name: cochlear, dtype: int64
There are 1711 null values for cochlear
15296

Identify bilateral and bimodal individuals:

In [43]:
demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)
In [44]:
demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()
Out[44]:
(3718, 5632, 1437, 2149)
In [45]:
demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()
Out[45]:
unilateral_ci     636
bilateral_ci      935
bilateral_ha     2249
bimodal           384
dtype: int64

Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

In [46]:
demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))
There are 0 null values for tech
In [47]:
demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()
Out[47]:
6    5632
3    3718
4    1437
1    1034
0     692
8      13
2      12
7       5
5       1
Name: implant_category, dtype: int64

Age when hearing loss diagnosed Data are entered inconsistently here, so we have to go in and replace non-numeric values.

In [48]:
demographic.onset_1.unique()
Out[48]:
array([  15. ,   80. ,   14. ,   62. ,    2. ,   49. ,   19. ,    9. ,
         18. ,    4. ,    0. ,   10. ,   12. ,    1. ,   31. ,   16. ,
         26. ,   61. ,   46. ,   24. ,   36. ,   21. ,   52. ,   30. ,
          7. ,   51. ,    8. ,    3. ,    6. ,   17. ,   50. ,   23. ,
         42. ,   37. ,   33. ,   60. ,   13. ,    nan,   22. ,   28. ,
         82. ,   34. ,   35. ,   38. ,   95. ,    5. ,   59. ,   25. ,
         48. ,    1.5,   41. ,   53. ,   88. ,   29. ,   27. ,   39. ,
         65. ,   64. ,   47. ,   79. ,   97. ,   96. ,  107. ,   77. ,
         74. ,   11. ,   84. ,   20. ,   45. ,   32. ,   81. ,   55. ,
         58. ,   70. ,  154. ,   54. ,   57. ,   72. ,   43. ,   83. ,
         78. ,  116. ,   40. ,   44. ,  119. ,   63. ,   66. ,  140. ,
         56. ,   87. ,   76. ,   68. ,   92. ,   86. ,  126. ,   85. ,
        133. ,  103. ,   67. ,   71. ,    2.5,   98. ,   75. ,    0.5,
         89. ,  152. ])
In [49]:
# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1

Number of null values for age_diag

In [50]:
demographic.age_diag.isnull().sum()
Out[50]:
3864
In [51]:
demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})
In [52]:
import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')
In [162]:
unique_students.shape
Out[162]:
(5522, 64)

Child has another diagnosed disability

In [53]:
demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None
In [54]:
# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)

Missing sibling counts were properly encoded as None (missing).

In [55]:
demographic.loc[demographic.sib==4, 'sib'] = None

We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

In [56]:
races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column
_race:
0.0    7955
2.0    2649
1.0    1407
3.0    1074
6.0     751
8.0     542
7.0     241
4.0      66
5.0      37
Name: _race, dtype: int64
race:
0.0    7955
2.0    2649
1.0    1407
4.0    1396
3.0    1074
Name: race, dtype: int64
There are 815 null values for race

Recode implant technology variables

In [57]:
tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)
In [58]:
# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)
In [59]:
demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012',
                                   '0000-0000': np.nan}).str.replace('*', '-').unique()
Out[59]:
array(['2002-2003', '2009-2010', '2011-2012', '2009-2011', '2006-2007',
       '2007-2008', '2008-2009', '2014-2015', '2013-2014', '2012-2013',
       nan, '2015-2016', '2010-2011', '2014', '2005-2006', '2004-2005',
       '2003-2004',
       '2010-2011                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   2010-2011',
       '2011', '2010', '2009', '2012', '2013', '1995-1996', '1999-2000',
       '2000-2001', '1998-1999', '1997-1998', '2001-2002', '2014-15',
       '2015-2015', '2015', '2041-2015', '2015-2106', '22014-2015',
       '2014-1015'], dtype=object)
In [60]:
demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
                          '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
                          '2015-2015': '2014-2015', '2009-2011': '2009-2010',
                                   '0000-0000': np.nan}).str.replace('*', '-')

Removed entries that don't contain dashes

In [61]:
demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')), 
                'academic_year'] = np.nan
In [62]:
demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))
In [63]:
demographic.age_amp.hist()
Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a63eeb8>

Cleaning expressive vocabulary dataset

We converted the expressive vocabulary dataset to "long" format:

In [64]:
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
In [65]:
expressive.test_type.value_counts()
Out[65]:
EVT               3881
EOWPVT            2784
EOWPVT and EVT     149
Name: test_type, dtype: int64

A school variable was added, which is the first four columns of the study_id:

In [66]:
expressive["school"] = expressive.study_id.str.slice(0,4)

The age was taken to be the EOWPVT age if there are both test types:

In [67]:
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [68]:
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
Out[68]:
study_id redcap_event_name score test_type school age_test domain
0 0101-2002-0101 initial_assessment_arm_1 58.0 EOWPVT 0101 54.0 Expressive Vocabulary
2 0101-2002-0101 year_2_complete_71_arm_1 84.0 EOWPVT 0101 80.0 Expressive Vocabulary
5 0101-2002-0101 year_5_complete_71_arm_1 90.0 EOWPVT 0101 113.0 Expressive Vocabulary
14 0101-2004-0101 year_2_complete_71_arm_1 90.0 EOWPVT 0101 53.0 Expressive Vocabulary
15 0101-2004-0101 year_3_complete_71_arm_1 87.0 EOWPVT 0101 66.0 Expressive Vocabulary

Cleaning receptive vocabulary dataset

We converted the receptive vocabulary data table to "long" format:

In [69]:
receptive.columns
Out[69]:
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss',
       'age_test_rowpvt', 'rowpvt_ss'],
      dtype='object')
In [70]:
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type

A school variable was added, which is the first four columns of the study_id:

In [71]:
receptive["school"] = receptive.study_id.str.slice(0,4)

The age was taken to be the PPVT age if there are both test types:

In [72]:
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
In [73]:
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 23 null values for age_test

Finally, we dropped unwanted columns and added a domain identification column for merging:

In [74]:
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
Out[74]:
study_id redcap_event_name score test_type school age_test domain
2 0101-2002-0101 year_2_complete_71_arm_1 90.0 PPVT 0101 80.0 Receptive Vocabulary
5 0101-2002-0101 year_5_complete_71_arm_1 101.0 ROWPVT 0101 113.0 Receptive Vocabulary
9 0101-2003-0102 initial_assessment_arm_1 55.0 PPVT 0101 44.0 Receptive Vocabulary
10 0101-2003-0102 year_1_complete_71_arm_1 80.0 PPVT 0101 54.0 Receptive Vocabulary
11 0101-2003-0102 year_2_complete_71_arm_1 101.0 PPVT 0101 68.0 Receptive Vocabulary
In [75]:
receptive.study_id.unique().shape
Out[75]:
(3108,)

Merge datasets

The four datasets were mereged into a single table. First, we concatenate the test scores data:

In [76]:
test_scores = pd.concat([articulation, expressive, receptive, language])

Then we perform a merge between the demographic data and the test scores data:

In [77]:
lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')
In [78]:
lsl_dr.tail()
Out[78]:
redcap_event_name academic_year academic_year_rv hl male _race prim_lang sib _mother_ed father_ed ... sex known_synd synd_or_disab race age_test domain school score test_name test_type
39154 year_9_complete_71_arm_1 2011-2012 2011.0 0.0 1.0 0.0 0.0 3.0 6.0 6.0 ... Male 0.0 0.0 0.0 162 Receptive Vocabulary 0102 84 NaN ROWPVT
39155 year_9_complete_71_arm_1 NaN NaN 0.0 NaN NaN NaN NaN NaN NaN ... NaN 0.0 0.0 NaN 203 Expressive Vocabulary 1147 95 NaN EVT
39156 year_9_complete_71_arm_1 NaN NaN 0.0 0.0 2.0 0.0 1.0 6.0 6.0 ... Female 0.0 0.0 2.0 119 Articulation 0624 102 NaN Goldman
39157 year_9_complete_71_arm_1 NaN NaN 0.0 0.0 2.0 0.0 1.0 6.0 6.0 ... Female 0.0 0.0 2.0 119 Expressive Vocabulary 0624 96 NaN EVT
39158 year_9_complete_71_arm_1 NaN NaN 0.0 0.0 2.0 0.0 1.0 6.0 6.0 ... Female 0.0 0.0 2.0 119 Receptive Vocabulary 0624 82 NaN PPVT

5 rows × 74 columns

In [79]:
lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()
Out[79]:
2013    6952
2012    6641
2014    6144
2011    5256
2010    4457
nan     3164
2009    2502
2015    1646
2008     827
2007     536
2006     345
2005     286
2004     172
2003      90
2002      47
2001      37
1998      16
1999      16
2000      12
1997       6
2201       5
2041       1
1995       1
Name: academic_year_start, dtype: int64
In [80]:
current_year_only = False

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']
In [81]:
expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');
In [82]:
expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');

Export dataset

In [83]:
if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')
In [163]:
lsl_dr.shape
Out[163]:
(39159, 79)
In [164]:
lsl_dr.study_id.unique().shape
Out[164]:
(5898,)
In [86]:
demographic.study_id.unique().shape
Out[86]:
(5898,)

Convert score to floating-point number

In [87]:
lsl_dr.score = lsl_dr.score.astype(float)
In [88]:
lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'
In [89]:
lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)
In [90]:
lsl_dr.domain.dropna().unique()
Out[90]:
array(['Expressive Vocabulary', 'Language', 'Articulation',
       'Receptive Vocabulary'], dtype=object)
In [91]:
lsl_dr.groupby('tech_class').prim_lang.mean().round(2)
Out[91]:
tech_class
Bilateral CI    0.45
Bilateral HA    0.58
Bimodal         0.50
Name: prim_lang, dtype: float64
In [92]:
lsl_dr['non_profound'] = lsl_dr.degree_hl<6
In [93]:
lsl_dr.groupby('tech_class').non_profound.mean().round(2)
Out[93]:
tech_class
Bilateral CI    0.08
Bilateral HA    0.87
Bimodal         0.31
Name: non_profound, dtype: float64
In [138]:
lsl_dr['age_test_year'] = -999
lsl_dr.loc[lsl_dr.age_test.notnull(), 'age_test_year'] = (lsl_dr.age_test/12).dropna().astype(int)
lsl_dr.loc[lsl_dr.age_test_year==-999, 'age_test_year'] = np.nan
In [139]:
f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
    plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
    plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
    ax.set_ylim(40, 120)
    ax.set_xticks(range(2,7))
    ax.set_title(dom)

PPVT

In [141]:
ppvt_only = lsl_dr[lsl_dr.test_type=='PPVT']
ppvt_only.age_year.hist()
Out[141]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ad41be0>
In [142]:
ppvt_345 = ppvt_only[ppvt_only.age_test_year.isin([3,4,5])]
In [143]:
ppvt_345.score.describe()
Out[143]:
count    2576.000000
mean       92.463509
std        20.127618
min        20.000000
25%        79.000000
50%        94.000000
75%       107.000000
max       153.000000
Name: score, dtype: float64
In [145]:
ppvt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})
Out[145]:
score
min max median count_nonzero
age_test_year
3.0 36.0 153.0 95.0 873.0
4.0 20.0 149.0 94.0 936.0
5.0 20.0 142.0 91.0 767.0

EVT

In [146]:
lsl_dr.test_type.value_counts()
Out[146]:
expressive              6803
receptive               6744
Goldman                 5437
PPVT                    4445
EVT                     3881
EOWPVT                  2784
ROWPVT                  2346
Arizonia                 503
PPVT and ROWPVT          199
EOWPVT and EVT           149
Arizonia and Goldman      73
Name: test_type, dtype: int64
In [147]:
evt_only = lsl_dr[lsl_dr.test_type=='EVT']
evt_only.age_test_year.hist()
Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ad78470>
In [148]:
evt_345 = evt_only[evt_only.age_test_year.isin([3,4,5])]
In [149]:
evt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})
Out[149]:
score
min max median count_nonzero
age_test_year
3.0 19.0 147.0 100.0 767.0
4.0 20.0 146.0 99.0 813.0
5.0 20.0 150.0 97.0 644.0

PLS

In [151]:
pls_only = (language[(language.test_name=='PLS')]
           .convert_objects(convert_numeric=True))
pls_only['age_year'] = np.floor(pls_only.age_test/12).astype(int)
pls_345 = pls_only[pls_only.age_year.isin([3,4,5])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  from ipykernel import kernelapp as app
In [152]:
(pls_345.assign(normal_limits=pls_345.score>=85).groupby(['age_year', 'test_type'])
             .agg({'score':[min, max, np.median, len], 
                    'normal_limits': np.mean}))
Out[152]:
score normal_limits
min max median len mean
age_year test_type
3 expressive 50.0 145.0 78.0 813.0 0.355474
receptive 50.0 140.0 80.0 813.0 0.404674
4 expressive 50.0 141.0 73.0 602.0 0.284053
receptive 50.0 136.0 77.0 606.0 0.381188
5 expressive 50.0 138.0 68.0 304.0 0.259868
receptive 50.0 129.0 73.0 306.0 0.290850

CELF

In [153]:
celf_only = (language_subtest[(language_subtest.test_name=='CELF-P2')]
           .convert_objects(convert_numeric=True))
celf_only['age_year'] = np.floor(celf_only.age_test/12).astype(int)
celf_46 = celf_only[celf_only.age_year.isin([4,6])]
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  from ipykernel import kernelapp as app
In [154]:
subtests = ['celfp_ss_ss', 'celfp_ws_ss', 
                             'celfp_ev_ss', 'celfp_fd_ss',
                             'celfp_rs_ss', 'celfp_bc_ss', 
                             'celfp_wcr_ss', 'celfp_wce_ss',
                             'celfp_wct_ss']
In [155]:
(celf_46.groupby('age_year')
             .agg({st:np.median for st in subtests})).T
Out[155]:
age_year 4 6
celfp_wct_ss 10.0 8.0
celfp_ev_ss 8.0 5.0
celfp_wcr_ss 10.0 10.0
celfp_wce_ss 9.0 7.0
celfp_ss_ss 8.0 5.0
celfp_ws_ss 6.0 4.0
celfp_fd_ss 8.0 4.0
celfp_rs_ss 7.0 4.0
celfp_bc_ss 9.0 4.5

Proportions in normal range

In [108]:
def calc_norm_range(dataset):
    return (dataset.groupby('study_id').score.mean() >= 85).mean()

Mean score of each domain

In [156]:
calc_norm_range(lsl_dr[(lsl_dr.domain=='Language') 
                       & (lsl_dr.test_type=='expressive')
                       & (lsl_dr.age_test_year.isin([3,4,5]))])
Out[156]:
0.40083217753120665
In [157]:
for year in range(2010, 2014):
    value = calc_norm_range(lsl_dr[(lsl_dr.domain=='Language') 
                       & (lsl_dr.test_type=='receptive') & (lsl_dr.academic_year_rv==year)
                                  & (lsl_dr.age_test_year.isin([3,4,5]))]).round(2)
    print('{}: {}'.format(year, value))
2010: 0.53
2011: 0.48
2012: 0.5
2013: 0.55
In [158]:
calc_norm_range(lsl_dr[(lsl_dr.domain=='Receptive Vocabulary')
                      & (lsl_dr.age_test_year.isin([3,4,5]))])
Out[158]:
0.63506493506493511
In [159]:
calc_norm_range(lsl_dr[(lsl_dr.domain=='Expressive Vocabulary')
                      & (lsl_dr.age_test_year.isin([3,4,5]))])
Out[159]:
0.64257555847568992
In [160]:
calc_norm_range(lsl_dr[(lsl_dr.domain=='Articulation')
                      & (lsl_dr.age_test_year.isin([3,4,5]))])
Out[160]:
0.49158249158249157

Summary statistics

In [189]:
(lsl_dr.groupby('study_id').male.first().dropna()==0).mean()
Out[189]:
0.46830858384643242
In [197]:
(lsl_dr.groupby('study_id').race.first().dropna()==0).mean()
Out[197]:
0.54349040789718761
In [204]:
(lsl_dr.groupby('study_id').non_english.first().dropna()==False).sum()
Out[204]:
4404
In [208]:
lsl_dr.groupby('study_id').sib.first().dropna().count()
Out[208]:
5040
In [213]:
lsl_dr.groupby('study_id').onset_1.first().dropna().count()
Out[213]:
4161
In [215]:
lsl_dr.groupby('study_id').age_amp.first().dropna().median()
Out[215]:
8.0
In [218]:
lsl_dr.groupby('study_id').age_int.first().dropna().median()
Out[218]:
9.0
In [220]:
lsl_dr.groupby('study_id').age.first().dropna().count()
Out[220]:
5404
In [247]:
_unique = lsl_dr.dropna(subset=['age_disenrolled', 'age']).groupby('study_id').first()
(_unique.age_disenrolled - _unique.age).count()
Out[247]:
1868
In [254]:
synd_cause = lsl_dr.groupby('study_id').synd_cause.first().dropna()
synd_cause = synd_cause[synd_cause<3]
In [257]:
synd_cause.value_counts()/synd_cause.value_counts().sum()
Out[257]:
1.0    0.885766
0.0    0.091387
2.0    0.022847
Name: synd_cause, dtype: float64
In [262]:
etiology = lsl_dr.groupby('study_id').etiology.first().dropna()
etiology = etiology[etiology<3]
In [264]:
etiology.value_counts()/etiology.value_counts().sum()
Out[264]:
1.0    0.791393
0.0    0.163977
2.0    0.044630
Name: etiology, dtype: float64
In [267]:
lsl_dr['concerns'] = lsl_dr.etiology_2.replace({0:'none', 4:'none', 1:'mild', 2:'moderate', 3:'severe'})
In [270]:
lsl_dr.groupby('study_id').concerns.last().dropna().value_counts()
Out[270]:
none        3328
moderate     546
mild         436
severe       344
Name: concerns, dtype: int64

Plots of Demographic Data

In [271]:
plot_color = "#64AAE8"
In [272]:
def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()
In [273]:
unique_students = demographic.drop_duplicates('study_id')
In [274]:
unique_students.shape
Out[274]:
(5898, 68)
In [275]:
unique_students.age.describe()
Out[275]:
count    5381.000000
mean       29.302360
std        27.507899
min         0.000000
25%         8.000000
50%        24.000000
75%        40.000000
max       298.000000
Name: age, dtype: float64
In [276]:
plot_demo_data(unique_students.male, 
               ('Female', 'Male'), label_offset=20, color=plot_color)
In [277]:
plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)
In [278]:
unique_students.prim_lang.count()
Out[278]:
5419
In [279]:
plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)
In [280]:
unique_students.sib.count()
Out[280]:
5013
In [281]:
amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))
There are 4806 null values for age_amp
In [282]:
age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))
/Users/fonnescj/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
In [283]:
age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
plt.ylim(0,1000)
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))
In [284]:
age_amp_counts.sum()
Out[284]:
3806
In [285]:
unique_students.age_amp.max()
Out[285]:
173.0
In [286]:
(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')
Out[286]:
<matplotlib.text.Text at 0x11d496198>
In [287]:
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color, ylim=(0, 3000))
In [288]:
plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))
In [289]:
f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color)
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)
In [290]:
unique_students.tech_right.count()
Out[290]:
4742
In [291]:
unique_students.tech_left.count()
Out[291]:
4734
In [292]:
degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)
In [293]:
_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');
In [294]:
unique_students.degree_hl_as.count()
Out[294]:
4642
In [295]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)
In [296]:
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)
In [297]:
unique_students.type_hl_ad.count()
Out[297]:
4563
In [298]:
unique_students.type_hl_as.count()
Out[298]:
4660
In [299]:
type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)
In [300]:
demographic[demographic.study_id=='1147-2010-0064']
Out[300]:
redcap_event_name academic_year academic_year_rv hl male _race prim_lang sib _mother_ed father_ed ... bilateral_ci bilateral_ha bimodal tech implant_category age_diag sex known_synd synd_or_disab race
14665 initial_assessment_arm_1 2010-2011 2010.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0
14666 year_1_complete_71_arm_1 2011-2012 2011.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0
14667 year_2_complete_71_arm_1 2012-2013 2012.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0
14668 year_3_complete_71_arm_1 2013-2014 2013.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... False True False 0 6 51.0 Female 0.0 0.0 0.0

4 rows × 68 columns

In [301]:
receptive[receptive.study_id=='1147-2010-0064']
Out[301]:
study_id redcap_event_name score test_type school age_test domain
14665 1147-2010-0064 initial_assessment_arm_1 96.0 PPVT 1147 63.0 Receptive Vocabulary
14666 1147-2010-0064 year_1_complete_71_arm_1 91.0 PPVT 1147 73.0 Receptive Vocabulary
14667 1147-2010-0064 year_2_complete_71_arm_1 93.0 PPVT 1147 85.0 Receptive Vocabulary
In [302]:
lsl_dr[lsl_dr.study_id=='1147-2010-0064']
Out[302]:
redcap_event_name academic_year academic_year_rv hl male _race prim_lang sib _mother_ed father_ed ... school score test_name test_type academic_year_start tech_class age_year non_profound age_test_year concerns
5947 initial_assessment_arm_1 2010-2011 2010.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 91.0 NaN EVT 2010 Bilateral HA 4.0 True 5.0 NaN
5948 initial_assessment_arm_1 2010-2011 2010.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 96.0 NaN PPVT 2010 Bilateral HA 4.0 True 5.0 NaN
5949 initial_assessment_arm_1 2010-2011 2010.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 101.0 PLS receptive 2010 Bilateral HA 4.0 True 4.0 NaN
5950 initial_assessment_arm_1 2010-2011 2010.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 87.0 PLS expressive 2010 Bilateral HA 4.0 True 4.0 NaN
15880 year_1_complete_71_arm_1 2011-2012 2011.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 86.0 NaN EVT 2011 Bilateral HA 4.0 True 6.0 NaN
15881 year_1_complete_71_arm_1 2011-2012 2011.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 91.0 NaN PPVT 2011 Bilateral HA 4.0 True 6.0 NaN
23735 year_2_complete_71_arm_1 2012-2013 2012.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 95.0 NaN EVT 2012 Bilateral HA 4.0 True 7.0 NaN
23736 year_2_complete_71_arm_1 2012-2013 2012.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... 1147 93.0 NaN PPVT 2012 Bilateral HA 4.0 True 7.0 NaN
32791 year_3_complete_71_arm_1 2013-2014 2013.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 ... NaN NaN NaN NaN 2013 Bilateral HA 4.0 True NaN NaN

9 rows × 80 columns

In [303]:
unique_students.type_hl_ad.count()
Out[303]:
4563
In [304]:
receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[304]:
(3108,)
In [305]:
demographic.study_id.unique().shape
Out[305]:
(5898,)
In [306]:
receptive.study_id.unique().shape
Out[306]:
(3108,)
In [307]:
lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape
Out[307]:
(3108,)
In [308]:
receptive_ids = receptive.study_id.unique()
In [309]:
demographic_ids = demographic.study_id.unique()
In [310]:
[s for s in receptive_ids if s not in demographic_ids]
Out[310]:
[]
In [311]:
def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]
In [312]:
receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary
Out[312]:
Sample Size Mean SD Min Max
2 424 93.759434 17.998914 40.0 144.0
3 1444 92.173823 19.124304 0.0 153.0
4 1582 90.716814 20.243070 0.0 149.0
5 1189 89.994113 18.050597 0.0 142.0
6 678 85.961652 16.160065 40.0 154.0
7 442 83.244344 16.113797 40.0 130.0
8 313 80.651757 17.500828 20.0 132.0
9 235 78.629787 17.568035 25.0 160.0
10 194 76.479381 17.488178 20.0 123.0
11 463 78.539957 18.944497 20.0 134.0
In [313]:
receptive_summary.describe()
Out[313]:
Sample Size Mean SD Min Max
count 10.000000 10.000000 10.000000 10.0000 10.000000
mean 696.400000 85.015106 17.919228 20.5000 142.100000
std 515.522863 6.356443 1.280871 16.4063 12.068784
min 194.000000 76.479381 16.113797 0.0000 123.000000
25% 340.750000 79.135280 17.491340 5.0000 132.500000
50% 452.500000 84.602998 17.783475 20.0000 143.000000
75% 1061.250000 90.536139 18.721022 36.2500 152.000000
max 1582.000000 93.759434 20.243070 40.0000 160.000000
In [314]:
receptive_summary['Sample Size'].sum()
Out[314]:
6964
In [315]:
receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
Out[315]:
<matplotlib.text.Text at 0x129f16518>
In [316]:
expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary
Out[316]:
Sample Size Mean SD Min Max
2 403 92.885856 21.971304 23.0 145.0
3 1389 93.531317 21.386317 19.0 147.0
4 1557 92.419396 21.762937 0.0 146.0
5 1160 91.680172 19.999878 0.0 150.0
6 676 87.002959 18.252711 20.0 146.0
7 441 84.133787 15.653573 38.0 131.0
8 304 83.976974 16.415685 34.0 122.0
9 221 82.036199 16.163330 36.0 145.0
10 188 82.085106 15.380841 40.0 122.0
11 464 84.771552 17.333085 18.0 146.0
In [317]:
expressive_summary['Sample Size'].sum()
Out[317]:
6803
In [318]:
expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 800)
else:
    plt.ylim(0, 1800)
In [319]:
articulation_summary = score_summary("Articulation")
articulation_summary
Out[319]:
Sample Size Mean SD Min Max
2 306 85.254902 14.944281 50.0 122.0
3 1215 83.656790 18.416468 40.0 126.0
4 1407 83.461265 20.866057 0.0 123.0
5 1089 82.844812 20.790949 39.0 120.0
6 638 79.460815 21.809311 39.0 115.0
7 415 78.101205 22.341971 3.0 112.0
8 268 79.313433 21.212468 40.0 107.0
9 195 81.497436 20.757901 39.0 109.0
10 149 81.516779 20.128507 40.0 107.0
11 326 81.733129 19.477465 39.0 105.0
In [320]:
articulation_summary['Sample Size'].sum()
Out[320]:
6008
In [321]:
sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);

Language scores

In [322]:
lsl_dr.domain.unique()
Out[322]:
array(['Expressive Vocabulary', 'Language', 'Articulation', nan,
       'Receptive Vocabulary'], dtype=object)
In [323]:
lsl_dr.test_type.unique()
Out[323]:
array(['EOWPVT', 'receptive', 'expressive', 'Goldman', nan, 'ROWPVT',
       'Arizonia', 'EVT', 'PPVT', 'Arizonia and Goldman', 'EOWPVT and EVT',
       'PPVT and ROWPVT'], dtype=object)
In [324]:
receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary
Out[324]:
Sample Size Mean SD Min Max
2 988 86.411943 22.293414 50.0 150.0
3 1408 84.969460 19.728716 50.0 144.0
4 1391 85.321352 19.453493 43.0 145.0
5 985 83.943147 18.823820 47.0 140.0
6 515 78.081553 17.745640 11.0 127.0
7 331 76.129909 18.941810 40.0 123.0
8 201 74.880597 19.700652 40.0 127.0
9 55 70.363636 21.026759 40.0 120.0
10 47 79.617021 20.802961 40.0 120.0
11 69 77.101449 21.432620 40.0 139.0
In [325]:
receptive_language_summary['Sample Size'].sum()
Out[325]:
5990
In [326]:
sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [327]:
expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary
Out[327]:
Sample Size Mean SD Min Max
2 981 88.450561 18.587983 50.0 150.0
3 1410 82.344681 17.569380 20.0 147.0
4 1382 80.683792 19.533977 45.0 141.0
5 1006 78.666998 20.106123 45.0 144.0
6 536 71.820896 19.421195 6.0 140.0
7 354 67.426554 21.096070 40.0 124.0
8 211 68.312796 21.588506 40.0 119.0
9 55 65.163636 21.369556 40.0 108.0
10 47 77.574468 23.968952 40.0 119.0
11 68 73.882353 22.531258 40.0 132.0
In [328]:
expressive_language_summary['Sample Size'].sum()
Out[328]:
6050
In [329]:
sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);
In [330]:
(unique_students.age/12.).describe()
Out[330]:
count    5381.000000
mean        2.441863
std         2.292325
min         0.000000
25%         0.666667
50%         2.000000
75%         3.333333
max        24.833333
Name: age, dtype: float64
In [331]:
def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})
In [332]:
audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())
In [333]:
plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')
Out[333]:
<matplotlib.text.Text at 0x11bb3a198>
In [334]:
slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())
In [335]:
plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')
Out[335]:
<matplotlib.text.Text at 0x11a9d55f8>
In [336]:
sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())
In [337]:
plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')
Out[337]:
<matplotlib.text.Text at 0x11a95cc50>
In [338]:
lsl_dr.degree_hl.dropna().value_counts()
Out[338]:
6.0    17779
4.0     4722
3.0     4595
5.0     4336
2.0     1788
0.0     1273
1.0      307
Name: degree_hl, dtype: int64
In [339]:
ax = lsl_dr.degree_hl.hist(bins=7)
In [340]:
diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)
Out[340]:
<matplotlib.axes._subplots.AxesSubplot at 0x12056a5f8>
In [341]:
(lsl_dr.age_int<6).mean()
Out[341]:
0.20646594652570291
In [342]:
(lsl_dr.age<6).mean()
Out[342]:
0.13450292397660818

Counts by year

In [343]:
unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');