#!/usr/bin/env python
# coding: utf-8

# In[17]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
from pylab import *


# In[18]:


# kernel density estimation
from scipy.stats import gaussian_kde


# Apply grayscale first, seaborn whitegrid second to get greyscale plots.

# In[19]:


grayscale = True

if grayscale:
    plt.style.use('grayscale')


# In[20]:


print(style.available)


# In[21]:


get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
sns.set_style("whitegrid")


# In[22]:


CHRIS = True


# In[23]:


from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
if not CHRIS: 
    api_key = open("/Users/alicetoll/Documents/OPTION/token.txt").read()
else:
    api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)


# In[24]:


articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})


# In[25]:


expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})


# In[26]:


receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})


# In[27]:


language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})


# In[28]:


demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})


# # Language

# 5 language measures:
# * 3 versions of CELF
# * PLS
#     * pls_ac_rs: PLS: Auditory Comprehension Raw Score
#     * pls_ac_ss: PLS: Auditory Comprehension Standard Score
#     * pls_ec_rs: PLS: Expressive Communication Raw Score
#     * pls_ec_ss: PLS: Expressive Communication Standard Score
#     * pls_tl_rs: PLS: Total Language Score Standard Score Total
#     * pls_tl_ss: PLS: Total Language Score Standard Score
# * OWLS
#     * age_test_owls: Age at time of testing (OWLS)
#     * owls_lc_rs: OWLS: Listening Comprehension Raw Score
#     * owls_lc_ss: OWLS: Listening Comprehension Standard Score
#     * owls_oe_rs: OWLS: Oral Expression Raw Score
#     * owls_oe_ss: OWLS: Oral Expression Standard Score
#     * owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
#     * owls_oc_ss: OWLS: Oral Composite Standard Score
#     * owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
#     * owls_wes_as: OWLS: Written Expression Scale Ability Score
#     * owls_wes_ss: OWLS: Written Expression Scale Standard Score
#     * owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
#     * owls_lcss: OWLS: Language Composite Standard Score

# In[29]:


# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))


# In[30]:


language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()


# In[31]:


language['ageGroup'] = None # initial variable to none
language.loc[(language.age_test >= 36) & (language.age_test < 48), 'ageGroup'] = 3 
language.loc[(language.age_test >= 48) & (language.age_test < 60), 'ageGroup'] = 4 
language.loc[(language.age_test >= 60) & (language.age_test < 72), 'ageGroup'] = 5 
language.head()


# ## Expressive Language

# In[32]:


if CHRIS:
    get_ipython().system('mkdir DescriptiveFigures')


# In[33]:


expressive_lang = language.loc[language.test_type=='expressive'].copy()
expressive_lang.head()


# ### Create models for converting scores

# In[34]:


expressive_scores = (expressive_lang.groupby(['study_id','ageGroup'])
                     .apply(lambda x: x.pivot(columns='test_name', values='score')))


# In[35]:


expressive_scores = (expressive_scores.set_index(expressive_scores.index.droplevel(2))
                         .reset_index()
                        .groupby(['study_id','ageGroup'])).apply(max)


# In[36]:


expressive_owls_pls = expressive_scores[['OWLS', 'PLS']].dropna()
expressive_owls_pls.plot.scatter('OWLS', 'PLS')


# In[37]:


from sklearn import linear_model
reg_owls = linear_model.LinearRegression()
reg_owls.fit(expressive_owls_pls.OWLS.values.reshape(-1,1), expressive_owls_pls.PLS.values)


# In[38]:


expressive_lang.test_name.value_counts()


# In[39]:


expressive_lang['old_score'] = expressive_lang.score.copy()
pred_vals = reg_owls.predict(expressive_lang[expressive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='OWLS', 'score'] = pred_vals


# In[40]:


expressive_celf_pls = expressive_scores[['CELF-P2', 'PLS']].dropna()
expressive_celf_pls.plot.scatter('CELF-P2', 'PLS')


# In[41]:


reg_celf = linear_model.LinearRegression()
reg_celf.fit(expressive_celf_pls['CELF-P2'].values.reshape(-1,1), expressive_celf_pls.PLS.values)


# In[42]:


pred_vals = reg_celf.predict(expressive_lang[expressive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='CELF-P2', 'score'] = pred_vals


# In[43]:


expressive_scores[['CELF-4', 'PLS']].dropna().plot.scatter('CELF-4', 'PLS')


# There arent enough points to fit a model for CELF-2, so we will combine these directly.

# In[44]:


# Test type
expressive_lang["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]


# In[45]:


plt.style.use('grayscale')
sns.set_style("whitegrid")
bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
    y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)
    
expressive_lang.ageGroup.value_counts()
# expressive_lang.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])


# In[46]:


bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
    y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)


# In[47]:


# need to do this for some reason...
expressive_lang['scoreInt'] = np.array(expressive_lang.score, dtype = 'int')
expressive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])


# ## Receptive Language

# In[48]:


receptive_lang = language.loc[language.test_type=='receptive'].copy()
receptive_lang.head()


# In[49]:


receptive_lang.test_name.value_counts()


# In[50]:


receptive_scores = (receptive_lang.groupby(['study_id','ageGroup'])
                     .apply(lambda x: x.pivot(columns='test_name', values='score')))


# In[51]:


receptive_scores = (receptive_scores.set_index(receptive_scores.index.droplevel(2))
                         .reset_index()
                        .groupby(['study_id','ageGroup'])).apply(max)


# In[52]:


receptive_owls_pls = receptive_scores[['OWLS', 'PLS']].dropna()
receptive_owls_pls.plot.scatter('OWLS', 'PLS')


# In[53]:


receptive_celfp2_pls = receptive_scores[['CELF-P2', 'PLS']].dropna()
receptive_celfp2_pls.plot.scatter('CELF-P2', 'PLS')


# In[54]:


receptive_celf4_pls = receptive_scores[['CELF-4', 'PLS']].dropna()
receptive_celf4_pls.plot.scatter('CELF-4', 'PLS')


# In[55]:


reg_owls = linear_model.LinearRegression()
reg_owls.fit(receptive_owls_pls.OWLS.values.reshape(-1,1), receptive_owls_pls.PLS.values)


# In[56]:


receptive_lang['old_score'] = receptive_lang.score.copy()
pred_vals = reg_owls.predict(receptive_lang[receptive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='OWLS', 'score'] = pred_vals


# In[57]:


reg_celf = linear_model.LinearRegression()
reg_celf.fit(receptive_celfp2_pls['CELF-P2'].values.reshape(-1,1), receptive_celfp2_pls.PLS.values)


# In[58]:


pred_vals = reg_celf.predict(receptive_lang[receptive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='CELF-P2', 'score'] = pred_vals


# In[59]:


receptive_lang['scoreInt'] = np.array(receptive_lang.score, dtype = 'int')


# In[60]:


bp = receptive_lang.boxplot(column='scoreInt', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Language')
for i in [1,2,3]:
    y = receptive_lang.score[receptive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recLang.png', dpi=300)

receptive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])


# # Articulation

# In[61]:


# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]


# ### Map Arizonia onto Goldman

# In[62]:


# create indicator variable if a student took either test
articulation.loc[articulation.aaps_ss.notnull() | articulation.gf2_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = articulation.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.aaps_ss.notnull() & temp.gf2_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()


# In[63]:


AAPS = temp.aaps_ss.notnull()
GF2 = temp.gf2_ss.notnull()

temp.loc[AAPS, "test_name"] = "AAPS"
temp.loc[GF2, "test_name"] = "GF2"


# In[64]:


# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')


# In[65]:


reg = linear_model.LinearRegression()
reg.fit(both.aaps_ss.values.reshape(-1,1), both.gf2_ss.values)


# In[66]:


articulation['old_score'] = articulation.score.copy()
pred_vals = reg.predict(articulation[articulation.test_type=='Arizonia'].score.values.reshape(-1,1))
articulation.loc[articulation.test_type=='Arizonia', 'score'] = pred_vals


# In[67]:


articulation["school"] = articulation.study_id.str.slice(0,4)
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())


# In[68]:


articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()


# In[69]:


articulation['ageGroup'] = None # initial variable to none
articulation.loc[(articulation.age_test >= 36) & (articulation.age_test < 48), 'ageGroup'] = 3 
articulation.loc[(articulation.age_test >= 48) & (articulation.age_test < 60), 'ageGroup'] = 4 
articulation.loc[(articulation.age_test >= 60) & (articulation.age_test < 72), 'ageGroup'] = 5 

bp = articulation.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Articulation')
for i in [1,2,3]:
    y = articulation.score[articulation.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/artic.png', dpi=300)

articulation.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])


# # Expressive Vocabulary

# In[70]:


# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]


# In[71]:


expressive.test_type.value_counts()


# ### Map EVT to EOWPVT

# In[72]:


# create indicator variable if a student took either test
expressive.loc[expressive.evt_ss.notnull() | expressive.eowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = expressive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.evt_ss.notnull() & temp.eowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()


# In[73]:


EVT = temp.evt_ss.notnull()
EOWPVT = temp.eowpvt_ss.notnull()

temp.loc[EVT, "test_name"] = "EVT"
temp.loc[EOWPVT, "test_name"] = "EOWPVT"


# In[74]:


# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')


# In[75]:


del temp


# In[76]:


reg = linear_model.LinearRegression()
reg.fit(both.evt_ss.values.reshape(-1,1), both.eowpvt_ss.values)


# In[77]:


expressive['old_score'] = expressive.score.copy()
pred_vals = reg.predict(expressive[expressive.test_type=='EVT'].score.values.reshape(-1,1))
expressive.loc[expressive.test_type=='EVT', 'score'] = pred_vals


# In[78]:


expressive["school"] = expressive.study_id.str.slice(0,4)


# In[79]:


expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]


# In[80]:


expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()


# In[81]:


expressive['ageGroup'] = None # initial variable to none
expressive.loc[(expressive.age_test >= 36) & (expressive.age_test < 48), 'ageGroup'] = 3 
expressive.loc[(expressive.age_test >= 48) & (expressive.age_test < 60), 'ageGroup'] = 4 
expressive.loc[(expressive.age_test >= 60) & (expressive.age_test < 72), 'ageGroup'] = 5 

bp = expressive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Vocabulary')
for i in [1,2,3]:
    y = expressive.score[expressive.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expVocab.png', dpi=300)

expressive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])


# # Receptive Vocabulary

# In[82]:


receptive.columns


# In[83]:


# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]


# ### Map PPVT onto ROWPVT

# In[84]:


# create indicator variable if a student took either test
receptive.loc[receptive.ppvt_ss.notnull() | receptive.rowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = receptive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.ppvt_ss.notnull() & temp.rowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()


# In[85]:


PPVT = temp.ppvt_ss.notnull()
ROWPVT = temp.rowpvt_ss.notnull()

temp.loc[PPVT, "test_name"] = "PPVT"
temp.loc[ROWPVT, "test_name"] = "ROWPVT"


# In[86]:


# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')


# In[87]:


del temp


# In[88]:


rv_reg = linear_model.LinearRegression()
rv_reg.fit(both.ppvt_ss.values.reshape(-1,1), both.rowpvt_ss.values)


# In[89]:


receptive['old_score'] = receptive.score.copy()
pred_vals = reg.predict(receptive[receptive.test_type=='PPVT'].score.values.reshape(-1,1))
receptive.loc[receptive.test_type=='PPVT', 'score'] = pred_vals


# In[90]:


receptive["school"] = receptive.study_id.str.slice(0,4)


# In[91]:


receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]


# In[92]:


print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))


# In[93]:


receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()


# In[94]:


receptive.study_id.unique().shape


# In[95]:


receptive['ageGroup'] = None # initial variable to none
receptive.loc[(receptive.age_test >= 36) & (receptive.age_test < 48), 'ageGroup'] = 3 
receptive.loc[(receptive.age_test >= 48) & (receptive.age_test < 60), 'ageGroup'] = 4 
receptive.loc[(receptive.age_test >= 60) & (receptive.age_test < 72), 'ageGroup'] = 5 


# In[96]:


bp = receptive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Vocabulary')
for i in [1,2,3]:
    y = receptive.score[receptive.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recVocab.png', dpi=300)

receptive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])


# In[97]:


width = 3
sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==4].score), bw=width, label = "4-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==5].score), bw=width, label = "5-year-olds")


# In[98]:


receptive[receptive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
receptive[receptive.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=2)
# plt.legend(bbox_to_anchor=(1.05, 1),loc=2, borderaxespad=0.)
# plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
#           ncol=3, fancybox=True, shadow=True)


# # Merge Datasets

# In[99]:


test_scores = pd.concat([articulation, expressive, receptive, language])
test_scores.head()


# In[100]:


print(test_scores.test_type.value_counts())
print(test_scores.domain.value_counts())


# In[101]:


# make a new domain variable that contains only 5 categories
test_scores['domain2'] = test_scores.domain.copy()
test_scores.loc[test_scores.test_type == 'expressive', 'domain2'] = 'Expressive Language'
test_scores.loc[test_scores.test_type == 'receptive', 'domain2'] = 'Receptive Language'
test_scores.domain2.value_counts()


# In[102]:


# make a dataframe that contains only 3, 4, and 5-year-olds
test_scores_345 = test_scores.loc[(test_scores.age_test >= 36) & (test_scores.age_test < 72), ]
test_scores_345.domain2.value_counts()


# In[124]:


# Attempt to make black and white
bp = test_scores_345.boxplot(column='score', by='domain2', grid=False, sym='')
plt.title('Test scores for 3, 4, and 5-year-olds by functional outcome')
plt.suptitle("")
plt.xlabel(''); plt.ylabel('Standard score');
plt.xticks([1, 2, 3, 4, 5], ['Articulation', 'Expressive\nLanguage', 'Expressive\nVocabulary',
                            'Receptive\nLanguage', 'Receptive\nVocabulary'])
# Articulation
y = test_scores_345.score[test_scores_345.domain2=='Articulation'].dropna()
x = np.random.normal(1, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)

# Expressive Language
y = test_scores_345.score[test_scores_345.domain2=='Expressive Language'].dropna()
x = np.random.normal(2, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

# Expressive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Expressive Vocabulary'].dropna()
x = np.random.normal(3, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)  

# Receptive Language
y = test_scores_345.score[test_scores_345.domain2=='Receptive Language'].dropna()
x = np.random.normal(4, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

# Receptive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Receptive Vocabulary'].dropna()
x = np.random.normal(5, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

plt.axhline(y=85)

# for components in bp.keys():
#    for line in bp[components]:
#        line.set_color('black')     # black lines


# ## Histograms

# * Blue = three year olds
# * Green = four year olds
# * Red = five year olds
# 
# Need to make in black and white.  
# Need to make legend

# ## Expressive Language

# In[104]:


sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=False, hist=True, norm_hist=False);


# In[105]:


fig = sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=True, hist=False)
plt.yticks(fig.get_yticks(), fig.get_yticks() * 6735)
plt.ylabel('Counts', fontsize=16)


# In[106]:


sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
plt.ylabel('Density')
plt.yticks([0, 0.01, 0.02]);
# plt.yticks(fig.get_yticks(), fig.get_yticks() * 100)


# In[107]:


def kdeplot(domain, width, score='score'):
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==3, score]), bw=width, label = "3-year-olds")
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==4, score]), bw=width, label = "4-year-olds")
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==5, score]), bw=width, label = "5-year-olds")
    plt.ylabel('Density')


# In[108]:


kdeplot(expressive_lang, 4)


# In[109]:


expressive_lang[expressive_lang.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
expressive_lang[expressive_lang.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
expressive_lang[expressive_lang.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)


# ## Receptive Language

# In[110]:


kdeplot(receptive_lang, 4)


# In[111]:


receptive_lang[receptive_lang.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive_lang[receptive_lang.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
receptive_lang[receptive_lang.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)


# ## Articulation

# In[112]:


kdeplot(articulation, 4)


# In[113]:


articulation[articulation.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
articulation[articulation.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
articulation[articulation.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=2)


# ## Expressive Vocab

# In[114]:


kdeplot(expressive, 4)


# In[115]:


expressive[expressive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
expressive[expressive.ageGroup ==4].score.hist(alpha=0.3, label='3-year-olds')
expressive[expressive.ageGroup ==5].score.hist(alpha=0.3, label='3-year-olds')
plt.legend(loc=2)


# ## Receptive Vocab

# In[116]:


kdeplot(receptive, 4)


# In[117]:


receptive[receptive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==4].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==5].score.hist(alpha=0.3, label='3-year-olds')
plt.legend(loc=2)


# In[118]:


def kdesubplot(domain, width, ax):
    sns.kdeplot(np.array(domain[domain.ageGroup ==3].score), bw=width, label = "3-year-olds", ax=ax)
    sns.kdeplot(np.array(domain[domain.ageGroup ==4].score), bw=width, label = "4-year-olds", ax=ax)
    sns.kdeplot(np.array(domain[domain.ageGroup ==5].score), bw=width, label = "5-year-olds", ax=ax)
    plt.ylabel('Density')


# In[122]:


domains = [expressive_lang, receptive_lang, expressive, receptive, articulation]
names = ['Expressive language', 'Receptive language', 'Expressive vocabulary', 'Receptive vocabulary', 'Articulation']

fig, axes = plt.subplots(3,2, figsize=(10,6))
for i,ax in enumerate(axes.ravel()[:-1]):
    kdesubplot(domains[i], 4, ax)
    ax.set_title(names[i])
    ax.set_yticks([])
    if i==2:
        ax.set_ylabel('Density')
    if i>2:
        ax.set_xlabel('Score')
    if i:
        ax.legend_.remove()


axes[-1,-1].axis('off')
plt.tight_layout()
# fig.text(0, 0.5, 'Density', va='center', rotation='vertical')


# In[120]:


plt.figure()
plt.subplot(3,2,1)
kdeplot(expressive_lang, 4)
plt.title('Expressive Language')
plt.subplot(3,2,2)
kdeplot(receptive_lang, 4)
plt.title('Receptive Language')
plt.subplot(3,2,3)
kdeplot(expressive, 4)
plt.title('Expressive Vocabulary')
plt.subplot(3,2,4)
kdeplot(receptive, 4)
plt.title('Receptive Vocabulary')
plt.subplot(3,2,5)
kdeplot(articulation, 4)
plt.title('Articulation')

plt.tight_layout()