#!/usr/bin/env python
# coding: utf-8
"""Descriptive summaries for records with 48 <= age_test < 60.

Notebook export: tabulates demographic columns, hearing-loss and technology
indicator columns, and test scores by domain, all read from
``../data/clean/analysis_subset.csv``.  Bare expressions only display their
value when the cells are run interactively in IPython/Jupyter.
"""

# In[2]:

# Import modules and set options
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
# import pandas_profiling

sns.set(context='notebook', style='ticks')


# In[3]:

analysis_subset = (
    pd.read_csv('../data/clean/analysis_subset.csv', low_memory=False)
    .rename(columns={'onset_1': 'identify_mo'})
)


# In[4]:

analysis_subset.shape


# In[5]:

# Rows whose test age falls in [48, 60); the ``data_4yo`` name below suggests
# the unit is months -- TODO confirm against the data dictionary.
age_mask = (analysis_subset.age_test >= 48) & (analysis_subset.age_test < 60)


# In[6]:

analysis_subset[age_mask].drop_duplicates(subset='student_idx').shape


# In[7]:

# One row per student (first occurrence kept) within the age window.
data_4yo = analysis_subset[age_mask].drop_duplicates(subset='student_idx').copy()


# ## Demographics

# In[8]:

data_4yo.male.replace({1: 'Male', 0: 'Female', np.nan: 'Missing'}).value_counts()


# In[9]:

data_4yo.race.replace({0: 'White', 1: 'Black', 2: 'Hispanic', 3: 'Asian',
                       4: 'Other', np.nan: 'Missing'}).value_counts()


# In[10]:

# NOTE(review): ``premature == True`` is labelled '>=36 weeks' here, which
# looks inverted -- confirm how the column is coded before reporting.
data_4yo.premature.replace({True: '>=36 weeks', False: '<36 weeks',
                            np.nan: 'Unknown'}).value_counts()


# In[11]:

data_4yo.sib.replace({0: '1', 1: '2', 2: '3', 3: '4+',
                      np.nan: 'Missing'}).value_counts().sort_index()


# In[12]:

family_lookup = {0: "Ideal Participation",
                 1: "Good Participation",
                 2: "Average Participation",
                 3: "Below Average",
                 4: "Limited Participation",
                 np.nan: "Missing"}

data_4yo.family_inv.replace(family_lookup).value_counts()


# ## Hearing loss

# In[13]:

data_4yo.deg_hl_below6.isnull().sum()


# In[14]:

# Hearing-loss *type* indicator columns, indexed by student.
hl_data = data_4yo.set_index('student_idx')[['bilateral_snhl',
                                             'bilateral_ansd',
                                             'bilateral_mixed',
                                             'bilateral_cond',
                                             'bilateral_normal',
                                             'bilateral_unk',
                                             'unilateral_snhl',
                                             'unilateral_ansd',
                                             'unilateral_mixed',
                                             'unilateral_cond',
                                             'unilateral_unk',
                                             'assymetrical']]


# One or both parents with hearing loss

# In[15]:

data_4yo.one_or_both_parent_hl.value_counts()


# In[16]:

data_4yo.one_or_both_parent_hl.isnull().sum()


# Individuals with no hearing loss type
# NOTE(review): the next two cells tabulate per-column totals and means of the
# type indicators; they do not list individuals lacking a type.

# In[17]:

hl_data.sum().astype(int).sort_values(ascending=False)


# In[18]:

hl_data.mean().round(2).sort_values(ascending=False)


# In[19]:

# Technology indicator columns, indexed by student.
tech_data = data_4yo.set_index('student_idx')[['bilateral_ha',
                                               'bilateral_ci',
                                               'bimodal',
                                               'bilateral_other',
                                               'unilateral_ha',
                                               'unilateral_ci',
                                               'unilateral_other']]


# In[20]:

tech_data.sum().astype(int).sort_values(ascending=False)


# In[21]:

tech_data.mean().round(2).sort_values(ascending=False)


# Individuals with no technology type

# In[22]:

# Students whose technology indicators sum to zero across the row.
tech_data[tech_data.sum(axis=1) == 0].index.values


# ## Hearing Loss

# In[23]:

data_4yo.columns[data_4yo.columns.str.contains('_hl_')]


# In[24]:

# ``hl_data`` is rebound here: from the type indicators above to the
# degree-of-loss indicator columns.
hl_data = data_4yo.set_index('student_idx')[['bilateral_hl_normal',
                                             'bilateral_hl_slight',
                                             'bilateral_hl_mild',
                                             'bilateral_hl_moderate',
                                             'bilateral_hl_modsev',
                                             'bilateral_hl_severe',
                                             'bilateral_hl_profound',
                                             'unilateral_hl_slight',
                                             'unilateral_hl_mild',
                                             'unilateral_hl_moderate',
                                             'unilateral_hl_modsev',
                                             'unilateral_hl_severe',
                                             'unilateral_hl_profound']]


# In[25]:

hl_data.sum().astype(int).sort_values(ascending=False)


# In[26]:

hl_data.mean().round(2).sort_values(ascending=False)


# ## Summary of scores by domain

# In[27]:

# All score rows in the age window (students may appear more than once).
test_scores = analysis_subset[age_mask].copy()


# In[28]:

# Split the generic 'Language' domain by test type; rows with any other
# test_type keep the 'Language' label.
test_scores.loc[(test_scores.domain == 'Language')
                & (test_scores.test_type == 'receptive'),
                'domain'] = 'Receptive Language'
test_scores.loc[(test_scores.domain == 'Language')
                & (test_scores.test_type == 'expressive'),
                'domain'] = 'Expressive Language'


# In[29]:

test_scores.groupby('domain').score.describe().round(1)


# In[30]:

test_scores.domain.unique()


# In[31]:

# Two-line axis labels so the category names fit under each plot column.
test_scores['domain_labels'] = test_scores.domain.replace(
    {'Expressive Vocabulary': 'Expressive\nVocabulary',
     'Receptive Vocabulary': 'Receptive\nVocabulary',
     'Receptive Language': 'Receptive\nLanguage',
     'Expressive Language': 'Expressive\nLanguage',
     'Language': 'Total\nLanguage'})


# In[32]:

def _plot_scores_by_domain(data, **catplot_kwargs):
    """Plot ``score`` against ``domain_labels`` and return the figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``domain_labels`` and ``score`` columns, plus any column
        named through ``hue`` in ``catplot_kwargs``.
    **catplot_kwargs
        Forwarded unchanged to ``seaborn.catplot`` (e.g. ``kind``, ``hue``).

    Returns
    -------
    matplotlib.figure.Figure
        The current figure after plotting, resized to 12 x 5 inches with the
        first axes' x-label cleared.
    """
    # ``x`` must be passed by keyword: newer seaborn releases treat the first
    # positional argument of catplot as ``data``.
    sns.catplot(x="domain_labels", y="score", data=data, **catplot_kwargs)
    figure = plt.gcf()
    figure.set_size_inches(12, 5)
    figure.axes[0].set_xlabel('')
    return figure


# Shared styling for the box plots below: white boxes, outlier markers hidden.
_BOX_STYLE = dict(kind="box", color='white', fliersize=0)

fig = _plot_scores_by_domain(test_scores)


# In[33]:

fig = _plot_scores_by_domain(test_scores, **_BOX_STYLE)


# ## Domain-specific summaries
#
# Age of amplification greater than 6mo.
# NOTE(review): ``age_amp > 6`` evaluates to False for missing ages, so rows
# without an amplification age land in the False group -- confirm intended.

# In[34]:

(test_scores.assign(age_amp_greater_6=test_scores.age_amp > 6)
    .groupby(['domain', 'age_amp_greater_6'])
    .score.describe())


# In[35]:

fig = _plot_scores_by_domain(
    test_scores.assign(age_amp_greater_6=test_scores.age_amp > 6),
    hue='age_amp_greater_6', **_BOX_STYLE)


# Age of enrollment greater than 6mo.
# NOTE(review): as above, missing ``age_int`` values compare as False.

# In[36]:

(test_scores.assign(age_enroll_greater_6=test_scores.age_int > 6)
    .groupby(['domain', 'age_enroll_greater_6'])
    .score.describe())


# In[37]:

fig = _plot_scores_by_domain(
    test_scores.assign(age_enroll_greater_6=test_scores.age_int > 6),
    hue='age_enroll_greater_6', **_BOX_STYLE)


# (Non-)Profound hearing loss

# In[38]:

(test_scores.assign(non_profound_hl=test_scores.deg_hl_below6)
    .groupby(['domain', 'non_profound_hl'])
    .score.describe())


# In[39]:

fig = _plot_scores_by_domain(
    test_scores.assign(non_profound_hl=test_scores.deg_hl_below6),
    hue='non_profound_hl', **_BOX_STYLE)


# Mother with(out) college education

# In[40]:

test_scores.groupby(['domain', 'mother_college']).score.describe()


# In[41]:

fig = _plot_scores_by_domain(test_scores, hue='mother_college', **_BOX_STYLE)