#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Import modules and set options
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


# Connect to database to import data for the three test domains and demographic information:

# In[2]:


from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)


# In[3]:


metadata = lsl_dr_project.export_metadata()


# In[4]:


# for i,j in zip(lsl_dr_project.field_names, 
#                lsl_dr_project.field_labels):
#     print('{0}: \t{1}'.format(i,j))


# Import each database from REDCap:

# In[5]:


articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None,
                                                                                                'na_values':[999, 9999]})


# In[6]:


records = lsl_dr_project.export_records(fields=articulation_fields)


# In[7]:


print(records[0]['study_id'])


# In[8]:


expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})


# In[9]:


receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})


# In[10]:


language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf',
                   'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',
                   'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',
                   'celfp_wct_ss']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})


# In[11]:


demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year', 'academic_year_rv',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})


# In[12]:


demographic_raw.academic_year_rv.value_counts()


# In[13]:


demographic_raw[demographic_raw.study_id=='1147-2010-0064']


# ## Attendance information

# Several fields in the demographic data have missing values.

# In[14]:


demographic_raw.head()


# We can fill missing values forward from previous observation (by `study_id`)

# In[15]:


demographic = demographic_raw.sort(columns='redcap_event_name').groupby('study_id').transform(
                                    lambda recs: recs.fillna(method='ffill'))#.reset_index()
demographic["study_id"] = demographic_raw.sort(columns='redcap_event_name').study_id


# Random check to make sure this worked

# In[16]:


demographic[demographic.study_id=='1147-2010-0064']


# Demographic data without missing values:

# In[17]:


demographic.head()


# ## Cleaning languge dataset
# 
# 5 language measures:
# 
# - 3 versions of CELF
# - PLS
#     - pls_ac_rs: 	PLS: Auditory Comprehension Raw Score
#     - pls_ac_ss: 	PLS: Auditory Comprehension Standard Score
#     - pls_ec_rs: 	PLS: Expressive Communication Raw Score
#     - pls_ec_ss: 	PLS: Expressive Communication Standard Score
#     - pls_tl_rs: 	PLS: Total Language Score Standard Score Total
#     - pls_tl_ss: 	PLS: Total Language Score Standard Score
# - OWLS
#     - age_test_owls: 	Age at time of testing (OWLS)
#     - owls_lc_rs: 	OWLS: Listening Comprehension Raw Score
#     - owls_lc_ss: 	OWLS: Listening Comprehension Standard Score
#     - owls_oe_rs: 	OWLS: Oral Expression Raw Score
#     - owls_oe_ss: 	OWLS: Oral Expression Standard Score
#     - owls_oc_sss: 	OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
#     - owls_oc_ss: 	OWLS: Oral Composite Standard Score
#     - owls_wes_trs: 	OWLS: Written Expression Scale Total Raw Score
#     - owls_wes_as: 	OWLS: Written Expression Scale Ability Score
#     - owls_wes_ss: 	OWLS: Written Expression Scale Standard Score
#     - owsl_lc: 	OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
#     - owls_lcss: 	OWLS: Language Composite Standard Score

# In[18]:


# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))


# A `school` variable was added, which is the first four columns of the `study_id`:

# In[19]:


language["school"] = language.study_id.str.slice(0,4)


# In[20]:


language_subtest = language[["study_id", "redcap_event_name", "score", "test_type", 
                             "test_name", "school", "age_test", 
                             'celfp_ss_ss', 'celfp_ws_ss', 
                             'celfp_ev_ss', 'celfp_fd_ss',
                             'celfp_rs_ss', 'celfp_bc_ss', 
                             'celfp_wcr_ss', 'celfp_wce_ss',
                             'celfp_wct_ss']]


# In[21]:


language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()


# ## Cleaning articulation dataset
# 
# We converted the articulation dataset into a "long" format:

# In[22]:


# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]


# A `school` variable was added, which is the first four columns of the `study_id`:

# In[23]:


articulation["school"] = articulation.study_id.str.slice(0,4)


# The age was taken to be the Arizonia age if there are both test types:

# In[24]:


articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())


# Finally, we dropped unwanted columns and added a domain identification column for merging:

# In[25]:


articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()


# ## Cleaning demographic dataset
# 
# We excluded unwanted columns and rows for which age, gender or race were missing:

# In[26]:


# Retain only subset of columns
#demographic = demographic[demographic.gender.notnull()]
demographic = demographic.rename(columns={'gender':'male'})


# Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):

# In[27]:


demographic["non_english"] = None
demographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0
print(demographic.non_english.value_counts())
print("There are {0} null values for non_english".format(sum(demographic.non_english.isnull())))


# Mother's education (`mother_ed`) and father's education (`father_ed`) were both recoded to: 
# 
# * 0=no high school diploma
# * 1=high school
# * 2=undergraduate
# * 3=graduate
# 
# Category 6 (unknown) was recoded as missing.

# In[28]:


demographic = demographic.rename(columns={"mother_ed":"_mother_ed"})
demographic["mother_ed"] = demographic._mother_ed.copy()
demographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0
demographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1
demographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2
demographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3
demographic.loc[demographic._mother_ed==6, 'mother_ed'] = None
print("_mother_ed:")
print(demographic._mother_ed.value_counts())
print("mother_ed:")
print(demographic.mother_ed.value_counts())
print("\nThere are {0} null values for mother_ed".format(sum(demographic.mother_ed.isnull())))


# Secondary diagnosis

# In[29]:


demographic.shape


# In[30]:


demographic['secondary_diagnosis'] = demographic.etiology==0
# Suspected or unknown treated as missing
demographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None


# In[31]:


demographic.secondary_diagnosis.value_counts()


# In[32]:


demographic.secondary_diagnosis.mean()


# Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks.

# In[33]:


demographic['premature_weeks'] = demographic.premature_age.copy()
demographic.loc[demographic.premature_age==9, 'premature_weeks'] = None
demographic.premature_weeks = abs(demographic.premature_weeks-8)*2
print("There are {0} null values for premature_weeks".format(sum(demographic.premature_weeks.isnull())))


# In[34]:


demographic.premature_weeks.value_counts()


# Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):

# In[35]:


demographic.tech_ad.value_counts()


# In[36]:


tech_cats = ["None", "OAD", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = 4
demographic.loc[demographic.tech_ad==7, 'tech_right'] = 0
demographic.loc[demographic.tech_ad==3, 'tech_right'] = 1
demographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2
demographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3
demographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None

demographic["tech_left"] = 4
demographic.loc[demographic.tech_as==7, 'tech_left'] = 0
demographic.loc[demographic.tech_as==3, 'tech_left'] = 1
demographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2
demographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3
demographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None


# In[37]:


demographic.tech_left.value_counts()


# In[38]:


demographic.tech_right.value_counts()


# Substitute valid missing values for hearing loss:

# In[39]:


demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None
demographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None


# Create `degree_hl`, which is the maximum level of hearing loss in either ear:

# In[40]:


demographic["degree_hl"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)


# Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant): 
# 
# * 0=none
# * 1=one ear
# * 2=both ears.

# In[41]:


demographic.columns


# In[42]:


demographic["oad"] = 0
demographic.oad = demographic.oad.astype(object)
demographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1
demographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None
print("oad:")
print(demographic.drop_duplicates(subset='study_id').oad.value_counts())
print("There are {0} null values for OAD".format(sum(demographic.oad.isnull())))

demographic["hearing_aid"] = 0
demographic.hearing_aid = demographic.hearing_aid.astype(object)
demographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1
demographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None
print("\nhearing_aid:")
print(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())
print("There are {0} null values for hearing_aid".format(sum(demographic.hearing_aid.isnull())))

demographic["cochlear"] = 0
demographic.cochlear = demographic.cochlear.astype(object)
demographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1
demographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2
demographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None
print("\ncochlear:")
print(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())
print("There are {0} null values for cochlear".format(sum(demographic.cochlear.isnull())))
print(len(demographic))


# Identify bilateral and bimodal individuals:

# In[43]:


demographic["unilateral_ci"] = demographic.cochlear==1
demographic["bilateral_ci"] = demographic.cochlear==2
demographic["bilateral_ha"] = demographic.hearing_aid==2
demographic["bimodal"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)


# In[44]:


demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()


# In[45]:


demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', 
                                               'bilateral_ha',
                                               'bimodal']].sum()


# Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)

# In[46]:


demographic['tech'] = 0
demographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1
demographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2
print("There are {0} null values for tech".format(sum(demographic.tech.isnull())))


# In[47]:


demographic["implant_category"] = None
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 0
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 1
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 2
demographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), 
                'implant_category'] = 3
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), 
                'implant_category'] = 4
demographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), 
                'implant_category'] = 5
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), 
                'implant_category'] = 6
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), 
                'implant_category'] = 7
demographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), 
                'implant_category'] = 8
demographic.implant_category.value_counts()


# **Age when hearing loss diagnosed** Data are entered inconsistently here, so we have to go in and replace non-numeric values.

# In[48]:


demographic.onset_1.unique()


# In[49]:


# Don't need this anymore
# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, 
#                              'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)
demographic['age_diag'] = demographic.onset_1


# Number of null values for `age_diag`

# In[50]:


demographic.age_diag.isnull().sum()


# In[51]:


demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})


# In[52]:


import seaborn as sb

unique_students = demographic.dropna(subset=['sex']).groupby('study_id').first()

# ag = sb.factorplot("sex", data=unique_students, 
#               palette="PuBuGn_d", kind='count')
# ag.set_xticklabels(['Female ({})'.format((unique_students.male==0).sum()), 
#                     'Male ({})'.format((unique_students.male==1).sum())])
# ag.set_xlabels('')


# In[162]:


unique_students.shape


# Child has another diagnosed disability

# In[53]:


demographic['known_synd'] = (demographic.synd_cause == 0)
# Unknown or suspected
demographic.loc[demographic.synd_cause > 1, 'known_synd'] = None


# In[54]:


# If either known syndrome or secondary diagnosis
demographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)


# Missing sibling counts were properly encoded as `None` (missing).

# In[55]:


demographic.loc[demographic.sib==4, 'sib'] = None


# We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to "other", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing.

# In[56]:


races = ["Caucasian", "Black or African American", "Hispanic or Latino", "Asian", "Other"]
demographic = demographic.rename(columns={"race":"_race"})
demographic["race"] = demographic._race.copy()
demographic.loc[demographic.race==7, 'race'] = None
demographic.loc[demographic.race>3, 'race'] = 4
print("_race:")
print(demographic._race.value_counts())
print("race:")
print(demographic.race.value_counts())
print("There are {0} null values for race".format(sum(demographic.race.isnull())))
# Replace with recoded column


# Recode implant technology variables

# In[57]:


tech_cats = ["None", "Baha", "Hearing aid", "Cochlear", "Other"]

demographic["tech_right"] = demographic.tech_ad.copy()
demographic.loc[demographic.tech_right==6, 'tech_right'] = 0
demographic.loc[demographic.tech_right==4, 'tech_right'] = 1
demographic.loc[demographic.tech_right==5, 'tech_right'] = 1
demographic.loc[demographic.tech_right==3, 'tech_right'] = 2
demographic.loc[demographic.tech_right==7, 'tech_right'] = 3
demographic.loc[demographic.tech_right==8, 'tech_right'] = 3
demographic.loc[demographic.tech_right==9, 'tech_right'] = 4
demographic.tech_right = np.abs(demographic.tech_right - 3)

demographic["tech_left"] = demographic.tech_as.copy()
demographic.loc[demographic.tech_left==6, 'tech_left'] = 0
demographic.loc[demographic.tech_left==4, 'tech_left'] = 1
demographic.loc[demographic.tech_left==5, 'tech_left'] = 1
demographic.loc[demographic.tech_left==3, 'tech_left'] = 2
demographic.loc[demographic.tech_left==7, 'tech_left'] = 3
demographic.loc[demographic.tech_left==8, 'tech_left'] = 3
demographic.loc[demographic.tech_left==9, 'tech_left'] = 4
demographic.tech_left = np.abs(demographic.tech_left - 3)


# In[58]:


# Don't need this anymore
# demographic['age_amp'] = demographic.age_amp.replace({'22 mo': 22, 'unknown': np.nan, 'none': np.nan, 
#                              'n/a unilateral loss': np.nan, 'not amplified yet': np.nan,
#                              'not amplified': np.nan, 'n/a': np.nan, '4 months-6 months': 5,
#                              '6 weeks': 0.5, '13-18': 15.5, '0-3': 1.5, '24-36': 30}).astype(float)


# In[59]:


demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012',
                                   '0000-0000': np.nan}).str.replace('*', '-').unique()


# In[60]:


demographic['academic_year'] = demographic.academic_year.replace(
                         {'12013-2014': '2013-2014', '2010 - 20111': '2010 - 2011', 
                          '2020-2011': '2010-2011', '2012-20013': '2012-2013', 
                          '642014-2015': '2014-2015', '20114-2015': '2014-2015',
                          '2011-012': '2011-2012', '2014-2105': '2014-2015', '2005-2004': '2004-2005',
                          '2014-205': '2014-2015', '2017-2015': '2014-2015', '2014-1015': '2014-2015',
                          '2015-2015': '2014-2015', '2009-2011': '2009-2010',
                                   '0000-0000': np.nan}).str.replace('*', '-')


# Removed entries that don't contain dashes

# In[61]:


demographic.loc[~(demographic.academic_year.notnull() & demographic.academic_year.str.contains('-')), 
                'academic_year'] = np.nan


# In[62]:


demographic.loc[demographic.academic_year.notnull(), 'academic_year'] = demographic.academic_year[demographic.academic_year.notnull()].apply(lambda x: ''.join(x.split()))


# In[63]:


demographic.age_amp.hist()


# ## Cleaning expressive vocabulary dataset
# 
# We converted the expressive vocabulary dataset to "long" format:

# In[64]:


# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]


# In[65]:


expressive.test_type.value_counts()


# A `school` variable was added, which is the first four columns of the `study_id`:

# In[66]:


expressive["school"] = expressive.study_id.str.slice(0,4)


# The age was taken to be the EOWPVT age if there are both test types:

# In[67]:


expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]


# Finally, we dropped unwanted columns and added a domain identification column for merging:

# In[68]:


expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()


# ## Cleaning receptive vocabulary dataset

# We converted the receptive vocabulary data table to "long" format:

# In[69]:


receptive.columns


# In[70]:


# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]


# A `school` variable was added, which is the first four columns of the `study_id`:

# In[71]:


receptive["school"] = receptive.study_id.str.slice(0,4)


# The age was taken to be the PPVT age if there are both test types:

# In[72]:


receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]


# In[73]:


print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))


# Finally, we dropped unwanted columns and added a domain identification column for merging:

# In[74]:


receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()


# In[75]:


receptive.study_id.unique().shape


# ## Merge datasets
# 
# The four datasets were mereged into a single table. First, we concatenate the test scores data:

# In[76]:


test_scores = pd.concat([articulation, expressive, receptive, language])


# Then we perform a merge between the demographic data and the test scores data:

# In[77]:


lsl_dr = pd.merge(demographic, test_scores, on=["study_id", "redcap_event_name"], how='left')


# In[78]:


lsl_dr.tail()


# In[79]:


lsl_dr['academic_year_start'] = lsl_dr.academic_year.apply(lambda x: str(x).strip()[:4])
lsl_dr.academic_year_start.value_counts()


# In[80]:


current_year_only = False

if current_year_only:
    lsl_dr = lsl_dr[lsl_dr.academic_year_start=='2013']


# In[81]:


expressive_scores = lsl_dr[(lsl_dr.domain=='Expressive Vocabulary') & (lsl_dr.score>=20)].score
expressive_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');


# In[82]:


expressive_lang_scores = lsl_dr[(lsl_dr.domain=='Language') 
                               & (lsl_dr.test_type=='expressive')].score
expressive_lang_scores.hist(bins=25)
plt.xlabel('Standard scores'); plt.ylabel('Frequency');


# Export dataset

# In[83]:


if current_year_only:

    lsl_dr.to_csv('lsl_dr_current_year.csv')

else:
    lsl_dr.to_csv('lsl_dr.csv')


# In[163]:


lsl_dr.shape


# In[164]:


lsl_dr.study_id.unique().shape


# In[86]:


demographic.study_id.unique().shape


# Convert score to floating-point number

# In[87]:


lsl_dr.score = lsl_dr.score.astype(float)


# In[88]:


lsl_dr['tech_class'] = 'Bimodal'
lsl_dr.loc[lsl_dr.bilateral_ci==True, 'tech_class'] = 'Bilateral CI'
lsl_dr.loc[lsl_dr.bilateral_ha==True, 'tech_class'] = 'Bilateral HA'


# In[89]:


lsl_dr['age_year'] = np.floor(lsl_dr.age/12.)


# In[90]:


lsl_dr.domain.dropna().unique()


# In[91]:


lsl_dr.groupby('tech_class').prim_lang.mean().round(2)


# In[92]:


lsl_dr['non_profound'] = lsl_dr.degree_hl<6


# In[93]:


lsl_dr.groupby('tech_class').non_profound.mean().round(2)


# In[138]:


lsl_dr['age_test_year'] = -999
lsl_dr.loc[lsl_dr.age_test.notnull(), 'age_test_year'] = (lsl_dr.age_test/12).dropna().astype(int)
lsl_dr.loc[lsl_dr.age_test_year==-999, 'age_test_year'] = np.nan


# In[139]:


f, axes = plt.subplots(2, 2, figsize=(14,10))
for ax, dom in zip(np.ravel(axes), lsl_dr.domain.dropna().unique()):
    plot_data = lsl_dr[lsl_dr.domain==dom].pivot_table(index='age_year', columns='tech_class', values='score', aggfunc='mean')
    plot_data[(plot_data.index>1) & (plot_data.index<7)].plot(ax=ax)
    ax.set_ylim(40, 120)
    ax.set_xticks(range(2,7))
    ax.set_title(dom)


# ## PPVT

# In[141]:


ppvt_only = lsl_dr[lsl_dr.test_type=='PPVT']
ppvt_only.age_year.hist()


# In[142]:


ppvt_345 = ppvt_only[ppvt_only.age_test_year.isin([3,4,5])]


# In[143]:


ppvt_345.score.describe()


# In[145]:


ppvt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})


# ## EVT

# In[146]:


lsl_dr.test_type.value_counts()


# In[147]:


evt_only = lsl_dr[lsl_dr.test_type=='EVT']
evt_only.age_test_year.hist()


# In[148]:


evt_345 = evt_only[evt_only.age_test_year.isin([3,4,5])]


# In[149]:


evt_345.groupby('age_test_year').agg({'score':[min, max, np.median, np.count_nonzero]})


# ## PLS

# In[151]:


pls_only = (language[(language.test_name=='PLS')]
           .convert_objects(convert_numeric=True))
pls_only['age_year'] = np.floor(pls_only.age_test/12).astype(int)
pls_345 = pls_only[pls_only.age_year.isin([3,4,5])]


# In[152]:


(pls_345.assign(normal_limits=pls_345.score>=85).groupby(['age_year', 'test_type'])
             .agg({'score':[min, max, np.median, len], 
                    'normal_limits': np.mean}))


# ## CELF

# In[153]:


celf_only = (language_subtest[(language_subtest.test_name=='CELF-P2')]
           .convert_objects(convert_numeric=True))
celf_only['age_year'] = np.floor(celf_only.age_test/12).astype(int)
celf_46 = celf_only[celf_only.age_year.isin([4,6])]


# In[154]:


subtests = ['celfp_ss_ss', 'celfp_ws_ss', 
                             'celfp_ev_ss', 'celfp_fd_ss',
                             'celfp_rs_ss', 'celfp_bc_ss', 
                             'celfp_wcr_ss', 'celfp_wce_ss',
                             'celfp_wct_ss']


# In[155]:


(celf_46.groupby('age_year')
             .agg({st:np.median for st in subtests})).T


# ### Proportions in normal range

# In[108]:


def calc_norm_range(dataset):
    return (dataset.groupby('study_id').score.mean() >= 85).mean()


# Mean score of each domain

# In[156]:


calc_norm_range(lsl_dr[(lsl_dr.domain=='Language') 
                       & (lsl_dr.test_type=='expressive')
                       & (lsl_dr.age_test_year.isin([3,4,5]))])


# In[157]:


for year in range(2010, 2014):
    value = calc_norm_range(lsl_dr[(lsl_dr.domain=='Language') 
                       & (lsl_dr.test_type=='receptive') & (lsl_dr.academic_year_rv==year)
                                  & (lsl_dr.age_test_year.isin([3,4,5]))]).round(2)
    print('{}: {}'.format(year, value))


# In[158]:


calc_norm_range(lsl_dr[(lsl_dr.domain=='Receptive Vocabulary')
                      & (lsl_dr.age_test_year.isin([3,4,5]))])


# In[159]:


calc_norm_range(lsl_dr[(lsl_dr.domain=='Expressive Vocabulary')
                      & (lsl_dr.age_test_year.isin([3,4,5]))])


# In[160]:


calc_norm_range(lsl_dr[(lsl_dr.domain=='Articulation')
                      & (lsl_dr.age_test_year.isin([3,4,5]))])


# Summary statistics

# In[189]:


(lsl_dr.groupby('study_id').male.first().dropna()==0).mean()


# In[197]:


(lsl_dr.groupby('study_id').race.first().dropna()==0).mean()


# In[204]:


(lsl_dr.groupby('study_id').non_english.first().dropna()==False).sum()


# In[208]:


lsl_dr.groupby('study_id').sib.first().dropna().count()


# In[213]:


lsl_dr.groupby('study_id').onset_1.first().dropna().count()


# In[215]:


lsl_dr.groupby('study_id').age_amp.first().dropna().median()


# In[218]:


lsl_dr.groupby('study_id').age_int.first().dropna().median()


# In[220]:


lsl_dr.groupby('study_id').age.first().dropna().count()


# In[247]:


_unique = lsl_dr.dropna(subset=['age_disenrolled', 'age']).groupby('study_id').first()
(_unique.age_disenrolled - _unique.age).count()


# In[254]:


synd_cause = lsl_dr.groupby('study_id').synd_cause.first().dropna()
synd_cause = synd_cause[synd_cause<3]


# In[257]:


synd_cause.value_counts()/synd_cause.value_counts().sum()


# In[262]:


etiology = lsl_dr.groupby('study_id').etiology.first().dropna()
etiology = etiology[etiology<3]


# In[264]:


etiology.value_counts()/etiology.value_counts().sum()


# In[267]:


lsl_dr['concerns'] = lsl_dr.etiology_2.replace({0:'none', 4:'none', 1:'mild', 2:'moderate', 3:'severe'})


# In[270]:


lsl_dr.groupby('study_id').concerns.last().dropna().value_counts()


# ### Plots of Demographic Data

# In[271]:


plot_color = "#64AAE8"


# In[272]:


def plot_demo_data(series, labels=None, color=plot_color, rot=0, label_offset=20, xlim=None, 
                   ylim=None, title=None, **kwargs):
    ax = kwargs.get('ax')
    if ax is None:
        f, ax = plt.subplots()
    counts = series.value_counts().sort_index()
    counts.plot(kind='bar', grid=False, rot=rot, color=color, **kwargs)
    if xlim is None:
        ax.set_xlim(-0.5, len(counts)-0.5)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_ylabel('Count')
    if labels is not None:
        ax.set_xticklabels(labels)
    if title:
        ax.set_title(title)
    for i,x in enumerate(counts):
        ax.annotate('%i' % x, (i, x + label_offset))
        
#     plt.gca().tight_layout()


# In[273]:


unique_students = demographic.drop_duplicates('study_id')


# In[274]:


unique_students.shape


# In[275]:


unique_students.age.describe()


# In[276]:


plot_demo_data(unique_students.male, 
               ('Female', 'Male'), label_offset=20, color=plot_color)


# In[277]:


plot_demo_data(unique_students.prim_lang, 
               ('English', 'Spanish', 'Chinese', 'French', 'German', 'Tagalong', 'Other'), 
               rot=70, color=plot_color)


# In[278]:


unique_students.prim_lang.count()


# In[279]:


plot_demo_data(unique_students.sib, ('1', '2', '3', '4+'), 
               color=plot_color)


# In[280]:


unique_students.sib.count()


# In[281]:


amp_ages = ["Birth-3 months", "4 months - 6 months", "7 months - 9 months", "10 months- 12 months", 
"13 months - 18 months", "19 months - 24 months", "2 years 1 day - 3 years", 
"3 years 1 day - 4 years", "4 years 1 day - 5 years", "5 years 1 day - 6 years", "6 years"]

demographic.loc[demographic.age_amp==11, 'age_amp'] = None
print("There are {0} null values for age_amp".format(sum(demographic.age_amp.isnull())))


# In[282]:


age_classes = pd.Series(pd.cut(unique_students.sort('age_amp').age_amp.dropna(), [-1,3,6,9,12,18,24,36,48,60,72,1000],
                               labels=amp_ages))


# In[283]:


age_amp_counts = age_classes.value_counts()[amp_ages]
age_amp_counts.plot(kind='bar', grid=False, rot=90, color=plot_color)
plt.xlim(-0.5, len(age_amp_counts)-0.5)
plt.ylabel('Count')
plt.ylim(0,1000)
for i,x in enumerate(age_amp_counts):
    plt.annotate('%i' % x, (i, x + 10))


# In[284]:


age_amp_counts.sum()


# In[285]:


unique_students.age_amp.max()


# In[286]:


(unique_students.age_amp/12.).hist(bins=16, grid=False, color=plot_color)
plt.ylabel('Count')
plt.xlabel('Age at amplification')


# In[287]:


plot_demo_data(unique_students.tech_left, tech_cats, rot=90, color=plot_color, ylim=(0, 3000))


# In[288]:


plot_demo_data(unique_students.tech_right, tech_cats, rot=90, color=plot_color, ylim=(0, 2500))


# In[289]:


f, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.tech_right, [""]*len(tech_cats), rot=90, 
               ax=axes[0], title='Right ear', color=plot_color)
plot_demo_data(unique_students.tech_left, tech_cats, rot=90, 
               ax=axes[1], title='Left ear', color=plot_color)


# In[290]:


unique_students.tech_right.count()


# In[291]:


unique_students.tech_left.count()


# In[292]:


degree_hl_cats = 'Normal (0-14)', 'Slight (15-25)', 'Mild (26-40)', \
'Moderate (41-55)', 'Moderately Severe (56-70)', 'Severe (71-90)', 'Profound (90+)'
plot_demo_data(unique_students.degree_hl_ad, degree_hl_cats, rot=90, color=plot_color)


# In[293]:


_, axes = plt.subplots(2, 1)
plot_demo_data(unique_students.degree_hl_ad, [""]*7, rot=90, 
               color=plot_color, ax=axes[0], title='Right ear')
plot_demo_data(unique_students.degree_hl_as, degree_hl_cats, rot=90, 
               color=plot_color, ylim=(0,2000), ax=axes[1], title='Left ear');


# In[294]:


unique_students.degree_hl_as.count()


# In[295]:


type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
plot_demo_data(unique_students.type_hl_ad, type_hl_cats, rot=90, color=plot_color)


# In[296]:


plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, color=plot_color)


# In[297]:


unique_students.type_hl_ad.count()


# In[298]:


unique_students.type_hl_as.count()


# In[299]:


type_hl_cats = 'Sensorineural', 'Conductive', 'Mixed', 'Neural', 'Normal', 'Unknown'
f, axes = plt.subplots(2,1)
plot_demo_data(unique_students.type_hl_ad, [""]*len(type_hl_cats), rot=90, 
               title='Right ear', ax=axes[0], color=plot_color)
plot_demo_data(unique_students.type_hl_as, type_hl_cats, rot=90, 
               title='Left ear', ax=axes[1], color=plot_color)


# In[300]:


demographic[demographic.study_id=='1147-2010-0064']


# In[301]:


receptive[receptive.study_id=='1147-2010-0064']


# In[302]:


lsl_dr[lsl_dr.study_id=='1147-2010-0064']


# In[303]:


unique_students.type_hl_ad.count()


# In[304]:


receptive[receptive.domain=="Receptive Vocabulary"].study_id.unique().shape


# In[305]:


demographic.study_id.unique().shape


# In[306]:


receptive.study_id.unique().shape


# In[307]:


lsl_dr[lsl_dr.domain=="Receptive Vocabulary"].study_id.unique().shape


# In[308]:


receptive_ids = receptive.study_id.unique()


# In[309]:


demographic_ids = demographic.study_id.unique()


# In[310]:


[s for s in receptive_ids if s not in demographic_ids]


# In[311]:


def score_summary(domain, test_type=None):
    subset = lsl_dr[lsl_dr.domain==domain].copy()
    if test_type is not None:
        subset = subset[subset.test_type==test_type]
    subset['age_test'] = (subset.age_test/12).dropna().astype(int)
    subset.loc[subset.age_test > 11, 'age_test'] = 11
    subset = subset[subset.age_test>1]
    byage = subset.groupby('age_test')
    n = byage.study_id.count()
    mean = byage.score.mean()
    sd = byage.score.std()
    min = byage.score.min()
    max = byage.score.max()
    summary = pd.DataFrame({'Sample Size':n, 'Mean':mean, 
    'SD':sd, 'Min':min, 'Max':max})
    summary.index = summary.index.values.astype(int)
    return summary[['Sample Size','Mean','SD','Min','Max']]


# In[312]:


receptive_summary = score_summary("Receptive Vocabulary")
receptive_summary


# In[313]:


receptive_summary.describe()


# In[314]:


receptive_summary['Sample Size'].sum()


# In[315]:


receptive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')


# In[316]:


expressive_summary = score_summary("Expressive Vocabulary")
expressive_summary


# In[317]:


expressive_summary['Sample Size'].sum()


# In[318]:


expressive_summary["Sample Size"].plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.xlim(-0.5, 9.5)
if current_year_only:
    plt.ylim(0, 800)
else:
    plt.ylim(0, 1800)


# In[319]:


articulation_summary = score_summary("Articulation")
articulation_summary


# In[320]:


articulation_summary['Sample Size'].sum()


# In[321]:


sample_size = articulation_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(articulation_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);


# Language scores

# In[322]:


lsl_dr.domain.unique()


# In[323]:


lsl_dr.test_type.unique()


# In[324]:


receptive_language_summary = score_summary("Language", "receptive")
receptive_language_summary


# In[325]:


receptive_language_summary['Sample Size'].sum()


# In[326]:


sample_size = receptive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(receptive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);


# In[327]:


expressive_language_summary = score_summary("Language", "expressive")
expressive_language_summary


# In[328]:


expressive_language_summary['Sample Size'].sum()


# In[329]:


sample_size = expressive_language_summary["Sample Size"]
sample_size.plot(kind='bar', grid=False, color=plot_color)
for i,x in enumerate(expressive_language_summary["Sample Size"]):
        plt.annotate('%i' % x, (i, x+10), va="bottom", ha="center")
plt.ylabel('Count')
plt.xlabel('Age')
plt.ylim(0, sample_size.max()+50);plt.xlim(-0.5, 9.5);


# In[330]:


(unique_students.age/12.).describe()


# In[331]:


def calc_difference(x, col='a_fo', jitter=True):
    if (len(x)<2 or x[col].isnull().sum() or x['funct_out_age'].isnull().sum()):
        return None
    diff = x[col][x.funct_out_age.argmax()] - x[col][x.funct_out_age.argmin()]
    if jitter:
        diff += np.random.normal(scale=0.05)
    if (x.funct_out_age.max() - x.funct_out_age.min()) > 1000:
        print(x['funct_out_age'])
    return({'difference':diff, 'months': x.funct_out_age.max() - x.funct_out_age.min()})


# In[332]:


audition = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference).dropna().values.tolist())


# In[333]:


plt.figure(figsize=(10,6))
plt.scatter(audition.months, audition.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Audition')


# In[334]:


slc = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='slc_fo').dropna().values.tolist())


# In[335]:


plt.figure(figsize=(10,6))
plt.scatter(slc.months, slc.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (comprehension)')


# In[336]:


sle = pd.DataFrame(demographic.groupby('study_id').apply(calc_difference, col='sle_fo').dropna().values.tolist())


# In[337]:


plt.figure(figsize=(10,6))
plt.scatter(sle.months, sle.difference, alpha=0.5)
plt.xlabel('Months between earliest and latest rating'); plt.ylabel('Progress (levels)')
plt.title('Spoken language (expression)')


# In[338]:


lsl_dr.degree_hl.dropna().value_counts()


# In[339]:


ax = lsl_dr.degree_hl.hist(bins=7)


# In[340]:


diff = (lsl_dr['age'] - lsl_dr['age_int'])
diff[diff>0].hist(bins=50)


# In[341]:


(lsl_dr.age_int<6).mean()


# In[342]:


(lsl_dr.age<6).mean()


# Counts by year

# In[343]:


unique_students = lsl_dr.groupby('study_id').first()
unique_students.academic_year_start.value_counts().sort_index()[:-1].plot(kind='bar')
plt.ylabel('Frequency'); plt.xlabel('Academic year');


# In[344]:


disab_by_year = unique_students.groupby('academic_year_start')['synd_or_disab'].value_counts().unstack().fillna(0)
disab_by_year.columns = ['No', 'Yes']
disab_by_year[disab_by_year.index!='nan'].plot(kind='bar', stacked=True)


# The following counts of ages allows for multiple tests per year

# In[345]:


test_age = (lsl_dr.assign(age_test_year=(lsl_dr.age_test/12))
     .dropna(subset=['age_test'])[['study_id','age_test','age_test_year']])
test_age.assign(age_year=test_age.age_test_year.astype(int)).age_year.value_counts().sort_index()


# This summary counts children only once per year:

# In[346]:


from itertools import chain

unique_age_vals = (test_age.assign(age_year=test_age.age_test_year.astype(int))
             .groupby('study_id')
             .age_year.unique().tolist())

pd.Series(np.concatenate(unique_age_vals)).value_counts().sort_index()


# In[347]:


test_age.assign(age_year=test_age.age_test_year.astype(int)).groupby('study_id').age_year.last().value_counts().sort_index()


# In[ ]: