#!/usr/bin/env python
# coding: utf-8

# In[2]:


# Import modules and set options
# Ask IPython to render matplotlib figures inline (this call only works inside an IPython kernel)
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# import pandas_profiling

# Seaborn defaults applied to every figure below: 'notebook' context, 'ticks' style
sns.set(context='notebook', style='ticks')


# In[3]:


# Load the cleaned analysis extract; the `onset_1` column is exposed as `identify_mo`.
analysis_subset = pd.read_csv(
    '../data/clean/analysis_subset.csv',
    low_memory=False,
).rename(columns={'onset_1': 'identify_mo'})


# In[4]:


# (rows, columns) of the loaded table
analysis_subset.shape


# In[5]:


# Rows with `age_test` in [48, 60) — presumably months, i.e. 4-year-olds; confirm units.
# Missing `age_test` compares as False and is therefore excluded.
age_mask = analysis_subset['age_test'].ge(48) & analysis_subset['age_test'].lt(60)


# In[6]:


# Cohort size once only the first row per student is kept
analysis_subset.loc[age_mask].drop_duplicates(subset=['student_idx']).shape


# In[7]:


# One row per student (first occurrence); copied so it is independent of `analysis_subset`
data_4yo = analysis_subset.loc[age_mask].drop_duplicates(subset=['student_idx']).copy()


# ## Demographics

# In[8]:


# Distribution of `male`, with missing values reported as their own category
(data_4yo['male']
    .replace({1: 'Male', 0: 'Female', np.nan: 'Missing'})
    .value_counts())


# In[9]:


# Distribution of `race`, with missing values reported as their own category
(data_4yo['race']
    .replace({0: 'White', 1: 'Black', 2: 'Hispanic', 3: 'Asian', 4: 'Other', np.nan: 'Missing'})
    .value_counts())


# In[10]:


# Distribution of prematurity status. A premature birth (`premature == True`) is the
# '<36 weeks' group and a non-premature birth is '>=36 weeks'; mapping them the other
# way round inverts the reported counts.
# NOTE(review): assumes `premature` is True for births before 36 weeks — confirm
# against the data dictionary.
(data_4yo['premature']
    .replace({True: '<36 weeks', False: '>=36 weeks', np.nan: 'Unknown'})
    .value_counts())


# In[11]:


# Distribution of `sib`, sorted by category label
(data_4yo['sib']
    .replace({0: '1', 1: '2', 2: '3', 3: '4+', np.nan: 'Missing'})
    .value_counts()
    .sort_index())


# In[12]:


# Display labels for the `family_inv` codes
family_lookup = {
    0: "Ideal Participation",
    1: "Good Participation",
    2: "Average Participation",
    3: "Below Average",
    4: "Limited Participation",
    np.nan: "Missing",
}

data_4yo['family_inv'].replace(family_lookup).value_counts()


# ## Hearing loss

# In[13]:


# Number of children with no recorded value for `deg_hl_below6`
data_4yo['deg_hl_below6'].isna().sum()


# In[14]:


# Hearing-loss type columns, indexed by student
hl_data = data_4yo.set_index('student_idx').loc[:, [
    'bilateral_snhl',
    'bilateral_ansd',
    'bilateral_mixed',
    'bilateral_cond',
    'bilateral_normal',
    'bilateral_unk',
    'unilateral_snhl',
    'unilateral_ansd',
    'unilateral_mixed',
    'unilateral_cond',
    'unilateral_unk',
    'assymetrical',
]]


# One or both parents with hearing loss

# In[15]:


# Counts of each recorded value of `one_or_both_parent_hl` (missing values are not counted here)
data_4yo['one_or_both_parent_hl'].value_counts()


# In[16]:


# Number of children with `one_or_both_parent_hl` missing
data_4yo['one_or_both_parent_hl'].isna().sum()


# Number and proportion of individuals by hearing loss type

# In[17]:


# Column totals of the hearing-loss type table, largest first
(hl_data
    .sum()
    .astype(int)
    .sort_values(ascending=False))


# In[18]:


# Column means of the same table, rounded to two decimals, largest first
(hl_data
    .mean()
    .round(decimals=2)
    .sort_values(ascending=False))


# In[19]:


# Hearing-technology columns, indexed by student
tech_data = data_4yo.set_index('student_idx').loc[:, [
    'bilateral_ha',
    'bilateral_ci',
    'bimodal',
    'bilateral_other',
    'unilateral_ha',
    'unilateral_ci',
    'unilateral_other',
]]


# In[20]:


# Column totals, largest first
(tech_data
    .sum()
    .astype(int)
    .sort_values(ascending=False))


# In[21]:


# Column means, rounded to two decimals, largest first
(tech_data
    .mean()
    .round(decimals=2)
    .sort_values(ascending=False))


# Individuals with no technology type

# In[22]:


# Student ids whose technology columns sum to zero across the row
tech_data.loc[tech_data.sum(axis=1).eq(0)].index.values


# ## Degree of hearing loss

# In[23]:


# Every column whose name contains '_hl_'
data_4yo.loc[:, data_4yo.columns.str.contains('_hl_')].columns


# In[24]:


# Hearing-loss degree columns, indexed by student (rebinds `hl_data`)
hl_data = data_4yo.set_index('student_idx').loc[:, [
    'bilateral_hl_normal',
    'bilateral_hl_slight',
    'bilateral_hl_mild',
    'bilateral_hl_moderate',
    'bilateral_hl_modsev',
    'bilateral_hl_severe',
    'bilateral_hl_profound',
    'unilateral_hl_slight',
    'unilateral_hl_mild',
    'unilateral_hl_moderate',
    'unilateral_hl_modsev',
    'unilateral_hl_severe',
    'unilateral_hl_profound',
]]


# In[25]:


# Column totals, largest first
(hl_data
    .sum()
    .astype(int)
    .sort_values(ascending=False))


# In[26]:


# Column means, rounded to two decimals, largest first
(hl_data
    .mean()
    .round(decimals=2)
    .sort_values(ascending=False))


# ## Summary of scores by domain

# In[27]:


# Every score row in the age window (not de-duplicated per student), as an independent copy
test_scores = analysis_subset.loc[age_mask].copy()


# In[28]:


# Split the 'Language' domain by test type; 'Language' rows with any other
# test type keep the plain 'Language' label.
for _test_type, _label in (('receptive', 'Receptive Language'),
                           ('expressive', 'Expressive Language')):
    test_scores.loc[
        test_scores['domain'].eq('Language') & test_scores['test_type'].eq(_test_type),
        'domain',
    ] = _label


# In[29]:


# Descriptive statistics of `score` per domain, rounded to one decimal
test_scores.groupby('domain')['score'].describe().round(decimals=1)


# In[30]:


# Distinct domain values present after relabelling
test_scores['domain'].unique()


# In[31]:


# Two-line versions of the domain names for use as axis tick labels;
# domains not listed in the mapping are left unchanged.
test_scores['domain_labels'] = test_scores['domain'].replace({
    'Expressive Vocabulary': 'Expressive\nVocabulary',
    'Receptive Vocabulary': 'Receptive\nVocabulary',
    'Receptive Language': 'Receptive\nLanguage',
    'Expressive Language': 'Expressive\nLanguage',
    'Language': 'Total\nLanguage',
})


# In[32]:


# Scores by domain, using catplot's default plot kind.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a bare column name in first position is no longer interpreted as `x`.
sns.catplot(x="domain_labels", y="score", data=test_scores)
fig = plt.gcf()
fig.set_size_inches(12, 5)
# The tick labels already name the domains, so the axis title is blanked
fig.axes[0].set_xlabel('');


# In[33]:


# Box plot of scores by domain (white boxes, outlier markers hidden via fliersize=0).
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a bare column name in first position is no longer interpreted as `x`.
sns.catplot(x="domain_labels", y="score", data=test_scores, kind="box", color='white', fliersize=0)
fig = plt.gcf()
fig.set_size_inches(12, 5)
# The tick labels already name the domains, so the axis title is blanked
fig.axes[0].set_xlabel('');


# ## Domain-specific summaries
# 
# Age of amplification greater than 6mo.

# In[34]:


# Score summary by domain, split on whether `age_amp` exceeds 6.
# Rows with a missing `age_amp` are dropped first: a bare `age_amp > 6` evaluates
# to False for NaN, which would silently count those children in the
# "not greater than 6" group.
(test_scores.dropna(subset=['age_amp'])
         .assign(age_amp_greater_6=lambda df: df.age_amp > 6)
         .groupby(['domain', 'age_amp_greater_6'])
         .score.describe())


# In[35]:


# Box plot of the same split, built from the same filtered frame so the table and
# the figure describe the same children.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.catplot(x="domain_labels", y="score", hue='age_amp_greater_6',
            data=(test_scores.dropna(subset=['age_amp'])
                             .assign(age_amp_greater_6=lambda df: df.age_amp > 6)),
            kind="box", color='white', fliersize=0)
fig = plt.gcf()
fig.set_size_inches(12, 5)
# The tick labels already name the domains, so the axis title is blanked
fig.axes[0].set_xlabel('');


# Age of enrollment greater than 6mo.

# In[36]:


# Score summary by domain, split on whether `age_int` exceeds 6.
# Rows with a missing `age_int` are dropped first: a bare `age_int > 6` evaluates
# to False for NaN, which would silently count those children in the
# "not greater than 6" group.
(test_scores.dropna(subset=['age_int'])
         .assign(age_enroll_greater_6=lambda df: df.age_int > 6)
         .groupby(['domain', 'age_enroll_greater_6'])
         .score.describe())


# In[37]:


# Box plot of the same split, built from the same filtered frame so the table and
# the figure describe the same children.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.catplot(x="domain_labels", y="score", hue='age_enroll_greater_6',
            data=(test_scores.dropna(subset=['age_int'])
                             .assign(age_enroll_greater_6=lambda df: df.age_int > 6)),
            kind="box", color='white', fliersize=0)
fig = plt.gcf()
fig.set_size_inches(12, 5)
# The tick labels already name the domains, so the axis title is blanked
fig.axes[0].set_xlabel('');


# (Non-)Profound hearing loss

# In[38]:


# Score summary by domain and `deg_hl_below6`, exposed under the name `non_profound_hl`
(test_scores
    .assign(non_profound_hl=lambda df: df['deg_hl_below6'])
    .groupby(['domain', 'non_profound_hl'])['score']
    .describe())


# In[39]:


# Box plot of scores by domain, split on `deg_hl_below6` (shown as `non_profound_hl`).
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a bare column name in first position is no longer interpreted as `x`.
sns.catplot(x="domain_labels", y="score", hue='non_profound_hl',
            data=test_scores.assign(non_profound_hl=test_scores.deg_hl_below6),
            kind="box", color='white', fliersize=0)
fig = plt.gcf()
fig.set_size_inches(12, 5)
# The tick labels already name the domains, so the axis title is blanked
fig.axes[0].set_xlabel('');


# Mother with(out) college education

# In[40]:


# Score summary for every (domain, mother_college) combination
(test_scores
    .groupby(['domain', 'mother_college'])['score']
    .describe())


# In[41]:


# Box plot of scores by domain, split on `mother_college`.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so a bare column name in first position is no longer interpreted as `x`.
sns.catplot(x="domain_labels", y="score", hue='mother_college', data=test_scores,
            kind="box", color='white', fliersize=0)
fig = plt.gcf()
fig.set_size_inches(12, 5)
# The tick labels already name the domains, so the axis title is blanked
fig.axes[0].set_xlabel('');