#!/usr/bin/env python
# coding: utf-8

# Notebook export. This part configures plotting, exports the score tables
# from the REDCap project, reshapes the language scores into one long table,
# and fits the regressions that map OWLS / CELF-P2 expressive scores onto PLS.

# In[17]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
# NOTE(review): the star import presumably rebinds `sum` and `max` to the
# NumPy versions; `.apply(max)` further down appears to depend on that, so
# do not remove this import without checking those calls.
from pylab import *
import os

# In[18]:

# kernel density estimation
from scipy.stats import gaussian_kde

# Apply grayscale first, seaborn whitegrid second to get greyscale plots.

# In[19]:

grayscale = True
if grayscale:
    plt.style.use('grayscale')

# In[20]:

print(plt.style.available)

# In[21]:

get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
sns.set_style("whitegrid")

# In[22]:

# Selects which user's API-token path is read below.
CHRIS = True

# In[23]:

from redcap import Project

api_url = 'https://redcap.vanderbilt.edu/api/'
if not CHRIS:
    token_path = "/Users/alicetoll/Documents/OPTION/token.txt"
else:
    token_path = "/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt"
# Close the file deterministically and drop any trailing newline so the
# token is passed to the API exactly as issued.
with open(token_path) as token_file:
    api_key = token_file.read().strip()
lsl_dr_project = Project(api_url, api_key)

# In[24]:

articulation_fields = ['study_id', 'redcap_event_name',
                       'age_test_aaps', 'aaps_ss', 'age_test_gf2', 'gf2_ss']
articulation = lsl_dr_project.export_records(
    fields=articulation_fields, format='df',
    df_kwargs={'index_col': None, 'na_values': [999, 9999]})

# In[25]:

expressive_fields = ['study_id', 'redcap_event_name',
                     'age_test_eowpvt', 'eowpvt_ss', 'age_test_evt', 'evt_ss']
expressive = lsl_dr_project.export_records(
    fields=expressive_fields, format='df',
    df_kwargs={'index_col': None, 'na_values': [999, 9999]})

# In[26]:

receptive_fields = ['study_id', 'redcap_event_name',
                    'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss']
receptive = lsl_dr_project.export_records(
    fields=receptive_fields, format='df',
    df_kwargs={'index_col': None, 'na_values': [999, 9999]})

# In[27]:

language_fields = ['study_id', 'redcap_event_name',
                   'pls_ac_ss', 'pls_ec_ss', 'pls_choice', 'age_test_pls',
                   'owls_lc_ss', 'owls_oe_ss', 'age_test_owls',
                   'celfp_rl_ss', 'celfp_el_ss', 'age_test_celp',
                   'celf_elss', 'celf_rlss', 'age_test_celf']
language_raw = lsl_dr_project.export_records(
    fields=language_fields, format='df',
    df_kwargs={'index_col': None, 'na_values': [999, 9999]})

# In[28]:

demographic_fields = ['study_id', 'redcap_event_name', 'redcap_data_access_group',
                      'academic_year', 'hl', 'prim_lang', 'mother_ed', 'father_ed',
                      'premature_age', 'synd_cause', 'age_disenrolled', 'race',
                      'onset_1', 'age_int', 'age', 'age_amp', 'age_ci', 'age_ci_2',
                      'degree_hl_ad', 'type_hl_ad', 'tech_ad', 'degree_hl_as',
                      'type_hl_as', 'tech_as', 'etiology', 'etiology_2', 'sib',
                      'gender', 'time', 'ad_250', 'as_250', 'ae', 'ad_500', 'as_500',
                      'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid',
                      'hearing_changes', 'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
                      'att_days_hr', 'att_days_sch', 'att_days_st2_417']
# Unlike the score exports above, 888 is also treated as missing here.
demographic_raw = lsl_dr_project.export_records(
    fields=demographic_fields, format='df',
    df_kwargs={'index_col': None, 'na_values': [888, 999, 9999]})

# # Language
# 5 language measures:
# * 3 versions of CELF
# * PLS
#     * pls_ac_rs: PLS: Auditory Comprehension Raw Score
#     * pls_ac_ss: PLS: Auditory Comprehension Standard Score
#     * pls_ec_rs: PLS: Expressive Communication Raw Score
#     * pls_ec_ss: PLS: Expressive Communication Standard Score
#     * pls_tl_rs: PLS: Total Language Score Standard Score Total
#     * pls_tl_ss: PLS: Total Language Score Standard Score
# * OWLS
#     * age_test_owls: Age at time of testing (OWLS)
#     * owls_lc_rs: OWLS: Listening Comprehension Raw Score
#     * owls_lc_ss: OWLS: Listening Comprehension Standard Score
#     * owls_oe_rs: OWLS: Oral Expression Raw Score
#     * owls_oe_ss: OWLS: Oral Expression Standard Score
#     * owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and
#       Oral Expression Standard Scores
#     * owls_oc_ss: OWLS: Oral Composite Standard Score
#     * owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
#     * owls_wes_as: OWLS: Written Expression Scale Ability Score
#     * owls_wes_ss: OWLS: Written Expression Scale Standard Score
#     * owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of
#       written expression age-based standard score, listening comprehension
#       standard score and oral expression standard score)
#     * owls_lcss: OWLS: Language Composite Standard Score

# In[29]:

# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None

# A test counts as taken when its age-at-test field is filled in.
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

# Later assignments overwrite earlier ones, so when a record has several
# tests the precedence is OWLS > PLS > CELF > CELP (same order as below).
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

# One copy of the tested records per subscale: language1 = receptive,
# language2 = expressive.
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss

language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

# Read the expressive scores from language2 itself (the original read them
# from language1, which only worked because both frames share the columns).
language2.loc[CELP, "score"] = language2.celfp_el_ss
language2.loc[CELF, "score"] = language2.celf_elss
language2.loc[PLS, "score"] = language2.pls_ec_ss
language2.loc[OWLS, "score"] = language2.owls_oe_ss

language = pd.concat([language1, language2])
# .copy() so the column assignments below act on an independent frame.
language = language[language.score.notnull()].copy()
print(pd.crosstab(language.test_name, language.test_type))

print("There are {0} null values for score".format(sum(language["score"].isnull())))

# In[30]:

# School identifier = first four characters of study_id.
language["school"] = language.study_id.str.slice(0, 4)

language = language[["study_id", "redcap_event_name", "score", "test_type",
                     "test_name", "school", "age_test"]].copy()
language["domain"] = "Language"
language.head()

# In[31]:

# Age bands (age_test bounds 36/48/60/72 map to groups 3/4/5); records
# outside 36-72 keep ageGroup = None.
language['ageGroup'] = None  # initial variable to none
language.loc[(language.age_test >= 36) & (language.age_test < 48), 'ageGroup'] = 3
language.loc[(language.age_test >= 48) & (language.age_test < 60), 'ageGroup'] = 4
language.loc[(language.age_test >= 60) & (language.age_test < 72), 'ageGroup'] = 5
language.head()

# ## Expressive Language

# In[32]:

# Every savefig call writes into this directory, so create it for all users
# and tolerate it already existing.
os.makedirs('DescriptiveFigures', exist_ok=True)

# In[33]:

expressive_lang = language.loc[language.test_type == 'expressive'].copy()
expressive_lang.head()

# ### Create models for converting scores

# In[34]:

# Within each (student, age group), spread the scores into one column per
# test name.
expressive_scores = (expressive_lang.groupby(['study_id', 'ageGroup'])
                     .apply(lambda x: x.pivot(columns='test_name', values='score')))

# In[35]:

# Drop the innermost index level (the original row label), then collapse each
# (student, age group) with `max`.
# NOTE(review): `max` here is whatever the pylab star import bound it to.
expressive_scores = (expressive_scores.set_index(expressive_scores.index.droplevel(2))
                     .reset_index()
                     .groupby(['study_id', 'ageGroup'])).apply(max)

# In[36]:

expressive_owls_pls = expressive_scores[['OWLS', 'PLS']].dropna()
expressive_owls_pls.plot.scatter('OWLS', 'PLS')

# In[37]:

from sklearn import linear_model

# Linear map OWLS -> PLS, fit on rows that have both scores.
reg_owls = linear_model.LinearRegression()
reg_owls.fit(expressive_owls_pls.OWLS.values.reshape(-1, 1),
             expressive_owls_pls.PLS.values)

# In[38]:

expressive_lang.test_name.value_counts()

# In[39]:

# Keep the untransformed score, then replace OWLS scores by their predicted
# PLS equivalents.
expressive_lang['old_score'] = expressive_lang.score.copy()
pred_vals = reg_owls.predict(
    expressive_lang[expressive_lang.test_name == 'OWLS'].score.values.reshape(-1, 1))
expressive_lang.loc[expressive_lang.test_name == 'OWLS', 'score'] = pred_vals

# In[40]:

expressive_celf_pls = expressive_scores[['CELF-P2', 'PLS']].dropna()
expressive_celf_pls.plot.scatter('CELF-P2', 'PLS')

# In[41]:

# Linear map CELF-P2 -> PLS.
reg_celf = linear_model.LinearRegression()
reg_celf.fit(expressive_celf_pls['CELF-P2'].values.reshape(-1, 1),
             expressive_celf_pls.PLS.values)

# In[42]:

pred_vals = reg_celf.predict(
    expressive_lang[expressive_lang.test_name == 'CELF-P2'].score.values.reshape(-1, 1))
expressive_lang.loc[expressive_lang.test_name == 'CELF-P2', 'score'] = pred_vals


def _assign_age_group(frame):
    """Add an 'ageGroup' column (3, 4 or 5) to *frame* in place.

    Uses the same age_test bounds as the rest of the notebook
    (36 <= age < 48 -> 3, 48 <= age < 60 -> 4, 60 <= age < 72 -> 5);
    rows outside 36-72 keep ageGroup = None.
    """
    frame['ageGroup'] = None  # initial variable to none
    for years in (3, 4, 5):
        in_band = (frame.age_test >= 12 * years) & (frame.age_test < 12 * (years + 1))
        frame.loc[in_band, 'ageGroup'] = years


def _boxplot_by_age(frame, title, outfile, box_column='score'):
    """Draw a box plot of *box_column* by ageGroup, overlay jittered points, save it.

    The overlaid points always use the 'score' column, as the original cells
    did. Returns whatever DataFrame.boxplot returns.
    """
    bp = frame.boxplot(column=box_column, by='ageGroup', grid=False, sym='')
    plt.xlabel('Age (years)')
    plt.ylabel('Standard score')
    plt.suptitle(title)
    for position, age in enumerate((3, 4, 5), start=1):
        y = frame.score[frame.ageGroup == age].dropna()
        # Add some random "jitter" to the x-axis
        x = np.random.normal(position, 0.04, size=len(y))
        plt.plot(x, y.values, 'k.', alpha=0.05)
    plt.savefig(outfile, dpi=300)
    return bp


def _single_and_both(frame, first_col, second_col, first_name, second_name):
    """Split *frame* into records with one score and records with both.

    Returns (single, both), each reduced with groupby('study_id').last().
    A 'test_name' column is added where *second_name* wins when both scores
    are present. The source frame is left untouched (the original cells
    leaked a temporary 'test' indicator column into it).
    """
    # Keep rows where at least one of the two scores is present.
    temp = frame.dropna(subset=[first_col, second_col], how='all').copy()
    has_first = temp[first_col].notnull()
    has_second = temp[second_col].notnull()

    # Flag records where the student took both tests in one observation.
    temp['both'] = 0
    temp.loc[has_first & has_second, 'both'] = 1
    print(temp.both.value_counts())

    temp.loc[has_first, 'test_name'] = first_name
    temp.loc[has_second, 'test_name'] = second_name

    messages = (
        'observations where a student took one test in a single year, but only',
        'observations where a student took both test in a single year, but only',
    )
    subsets = []
    for flag in (0, 1):
        rows = temp.loc[temp.both == flag]
        per_student = rows.groupby('study_id').last()
        print('We have', rows.shape[0], messages[flag],
              per_student.shape[0], 'unique students')
        subsets.append(per_student)
    return subsets[0], subsets[1]


# In[43]:

expressive_scores[['CELF-4', 'PLS']].dropna().plot.scatter('CELF-4', 'PLS')

# There aren't enough points to fit a model for CELF-4, so we will combine
# these directly.

# In[44]:

# This cell used to hold a stray copy of the articulation preprocessing whose
# first line reset expressive_lang["test_type"] to None. The articulation
# steps run once, in the Articulation section below.

# In[45]:

plt.style.use('grayscale')
sns.set_style("whitegrid")

bp = _boxplot_by_age(expressive_lang, 'Expressive Language',
                     'DescriptiveFigures/expLang.png')

expressive_lang.ageGroup.value_counts()
# expressive_lang.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

# In[46]:

# Same figure as In[45], redrawn (and re-saved) as in the original notebook.
bp = _boxplot_by_age(expressive_lang, 'Expressive Language',
                     'DescriptiveFigures/expLang.png')

# In[47]:

# need to do this for some reason...
# NOTE(review): presumably because `score` is an object column (it was
# initialised to None); the cast truncates predicted scores to integers.
expressive_lang['scoreInt'] = np.array(expressive_lang.score, dtype='int')
expressive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])

# ## Receptive Language

# In[48]:

receptive_lang = language.loc[language.test_type == 'receptive'].copy()
receptive_lang.head()

# In[49]:

receptive_lang.test_name.value_counts()

# In[50]:

# One column per test name within each (student, age group).
receptive_scores = (receptive_lang.groupby(['study_id', 'ageGroup'])
                    .apply(lambda x: x.pivot(columns='test_name', values='score')))

# In[51]:

# NOTE(review): `max` is presumably NumPy's, via the file's pylab star import.
receptive_scores = (receptive_scores.set_index(receptive_scores.index.droplevel(2))
                    .reset_index()
                    .groupby(['study_id', 'ageGroup'])).apply(max)

# In[52]:

receptive_owls_pls = receptive_scores[['OWLS', 'PLS']].dropna()
receptive_owls_pls.plot.scatter('OWLS', 'PLS')

# In[53]:

receptive_celfp2_pls = receptive_scores[['CELF-P2', 'PLS']].dropna()
receptive_celfp2_pls.plot.scatter('CELF-P2', 'PLS')

# In[54]:

receptive_celf4_pls = receptive_scores[['CELF-4', 'PLS']].dropna()
receptive_celf4_pls.plot.scatter('CELF-4', 'PLS')

# In[55]:

# Linear map OWLS -> PLS (receptive subscale).
reg_owls = linear_model.LinearRegression()
reg_owls.fit(receptive_owls_pls.OWLS.values.reshape(-1, 1),
             receptive_owls_pls.PLS.values)

# In[56]:

receptive_lang['old_score'] = receptive_lang.score.copy()
pred_vals = reg_owls.predict(
    receptive_lang[receptive_lang.test_name == 'OWLS'].score.values.reshape(-1, 1))
receptive_lang.loc[receptive_lang.test_name == 'OWLS', 'score'] = pred_vals

# In[57]:

# Linear map CELF-P2 -> PLS (receptive subscale).
reg_celf = linear_model.LinearRegression()
reg_celf.fit(receptive_celfp2_pls['CELF-P2'].values.reshape(-1, 1),
             receptive_celfp2_pls.PLS.values)

# In[58]:

pred_vals = reg_celf.predict(
    receptive_lang[receptive_lang.test_name == 'CELF-P2'].score.values.reshape(-1, 1))
receptive_lang.loc[receptive_lang.test_name == 'CELF-P2', 'score'] = pred_vals

# In[59]:

receptive_lang['scoreInt'] = np.array(receptive_lang.score, dtype='int')

# In[60]:

# Boxes use the integer scores; the jittered points use the raw `score`.
bp = _boxplot_by_age(receptive_lang, 'Receptive Language',
                     'DescriptiveFigures/recLang.png', box_column='scoreInt')

receptive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])

# # Articulation

# In[61]:

# Test type (the runtime labels keep the notebook's "Arizonia" spelling
# because later cells compare against them).
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
# .copy() so the assignments below act on an independent frame.
articulation = articulation[ARIZ | GF].copy()
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]

# ### Map Arizonia onto Goldman

# In[62]: / In[63]: / In[64]:

# NOTE: the original notebook recorded "73 students took both tests, 5716
# took only one" here and pasted the same figures into the two vocabulary
# sections; treat them as stale and rely on the printed counts.
single, both = _single_and_both(articulation, 'aaps_ss', 'gf2_ss', 'AAPS', 'GF2')

# In[65]:

# Linear map aaps_ss -> gf2_ss, fit on students with both scores.
reg = linear_model.LinearRegression()
reg.fit(both.aaps_ss.values.reshape(-1, 1), both.gf2_ss.values)

# In[66]:

articulation['old_score'] = articulation.score.copy()
pred_vals = reg.predict(
    articulation[articulation.test_type == 'Arizonia'].score.values.reshape(-1, 1))
articulation.loc[articulation.test_type == 'Arizonia', 'score'] = pred_vals

# In[67]:

articulation["school"] = articulation.study_id.str.slice(0, 4)

# Prefer the AAPS age; fall back to the GF2 age where it is missing.
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = \
    articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())

# In[68]:

articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()

# In[69]:

_assign_age_group(articulation)

bp = _boxplot_by_age(articulation, 'Articulation', 'DescriptiveFigures/artic.png')

articulation.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

# # Expressive Vocabulary

# In[70]:

# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT].copy()
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

# Score: EOWPVT when available, otherwise EVT.
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]

# In[71]:

expressive.test_type.value_counts()

# ### Map EVT to EOWPVT

# In[72]: / In[73]: / In[74]: / In[75]:

single, both = _single_and_both(expressive, 'evt_ss', 'eowpvt_ss', 'EVT', 'EOWPVT')

# In[76]:

# Linear map evt_ss -> eowpvt_ss. This rebinds `reg`, replacing the
# articulation model fit above.
reg = linear_model.LinearRegression()
reg.fit(both.evt_ss.values.reshape(-1, 1), both.eowpvt_ss.values)

# In[77]:

expressive['old_score'] = expressive.score.copy()
pred_vals = reg.predict(
    expressive[expressive.test_type == 'EVT'].score.values.reshape(-1, 1))
expressive.loc[expressive.test_type == 'EVT', 'score'] = pred_vals

# In[78]:

expressive["school"] = expressive.study_id.str.slice(0, 4)

# In[79]:

expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = \
    expressive.age_test_evt[expressive.age_test.isnull()]

# In[80]:

expressive = expressive[["study_id", "redcap_event_name", "score", "test_type",
                         "school", "age_test"]].copy()
expressive["domain"] = "Expressive Vocabulary"
expressive.head()

# In[81]:

_assign_age_group(expressive)

bp = _boxplot_by_age(expressive, 'Expressive Vocabulary',
                     'DescriptiveFigures/expVocab.png')

expressive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

# # Receptive Vocabulary

# In[82]:

receptive.columns

# In[83]:

# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT].copy()
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

# Score: PPVT when available, otherwise ROWPVT.
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]

# ### Map PPVT onto ROWPVT

# In[84]: / In[85]: / In[86]: / In[87]:

single, both = _single_and_both(receptive, 'ppvt_ss', 'rowpvt_ss', 'PPVT', 'ROWPVT')

# In[88]:

# Linear map ppvt_ss -> rowpvt_ss.
rv_reg = linear_model.LinearRegression()
rv_reg.fit(both.ppvt_ss.values.reshape(-1, 1), both.rowpvt_ss.values)

# In[89]:

receptive['old_score'] = receptive.score.copy()
# Use the receptive-vocabulary model fit just above; the original called
# `reg`, which at this point is the EVT -> EOWPVT model.
pred_vals = rv_reg.predict(
    receptive[receptive.test_type == 'PPVT'].score.values.reshape(-1, 1))
receptive.loc[receptive.test_type == 'PPVT', 'score'] = pred_vals

# In[90]:

receptive["school"] = receptive.study_id.str.slice(0, 4)

# In[91]:

receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = \
    receptive.age_test_rowpvt[receptive.age_test.isnull()]

# In[92]:

print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))

# In[93]:

receptive = receptive[["study_id", "redcap_event_name", "score", "test_type",
                       "school", "age_test"]].copy()
receptive["domain"] = "Receptive Vocabulary"
receptive.head()

# In[94]:

receptive.study_id.unique().shape

# In[95]:

_assign_age_group(receptive)

# In[96]:

bp = _boxplot_by_age(receptive, 'Receptive Vocabulary',
                     'DescriptiveFigures/recVocab.png')

receptive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

# In[97]:

# Bandwidth passed to sns.kdeplot here; a later cell reuses this name.
width = 3
for age in (3, 4, 5):
    sns.kdeplot(np.array(receptive[receptive.ageGroup == age].score),
                bw=width, label="{}-year-olds".format(age))

# In[98]:

for age in (3, 4, 5):
    receptive[receptive.ageGroup == age].score.hist(
        alpha=0.3, label='{}-year-olds'.format(age))
plt.legend(loc=2)
# plt.legend(bbox_to_anchor=(1.05, 1),loc=2, borderaxespad=0.)
# plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
#           ncol=3, fancybox=True, shadow=True)

# # Merge Datasets

# In[99]:

# Stack the per-domain tables into one long table.
test_scores = pd.concat([articulation, expressive, receptive, language])
test_scores.head()

# In[100]:

print(test_scores.test_type.value_counts())
print(test_scores.domain.value_counts())

# In[101]:

# make a new domain variable that contains only 5 categories:
# the "Language" rows are split by their expressive / receptive test_type.
test_scores['domain2'] = test_scores.domain.copy()
is_expressive = test_scores.test_type == 'expressive'
is_receptive = test_scores.test_type == 'receptive'
test_scores.loc[is_expressive, 'domain2'] = 'Expressive Language'
test_scores.loc[is_receptive, 'domain2'] = 'Receptive Language'
test_scores.domain2.value_counts()

# In[102]:

# make a dataframe that contains only 3, 4, and 5-year-olds
in_age_range = (test_scores.age_test >= 36) & (test_scores.age_test < 72)
test_scores_345 = test_scores.loc[in_age_range]
test_scores_345.domain2.value_counts()

# In[124]:

# Attempt to make black and white
bp = test_scores_345.boxplot(column='score', by='domain2', grid=False, sym='')
plt.title('Test scores for 3, 4, and 5-year-olds by functional outcome')
plt.suptitle("")
plt.xlabel('')
plt.ylabel('Standard score')

# Box positions 1..5 in the order the tick labels are written.
domain_order = ['Articulation', 'Expressive Language', 'Expressive Vocabulary',
                'Receptive Language', 'Receptive Vocabulary']
# Two-word names are broken over two lines on the axis.
plt.xticks([1, 2, 3, 4, 5], [name.replace(' ', '\n') for name in domain_order])

# Overlay the individual scores with horizontal jitter around each box.
for position, domain_name in enumerate(domain_order, start=1):
    y = test_scores_345.score[test_scores_345.domain2 == domain_name].dropna()
    x = np.random.normal(position, 0.08, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.02)

plt.axhline(y=85)

# for components in bp.keys():
#     for line in bp[components]:
#         line.set_color('black')     # black lines

# ## Histograms
# * Blue = three year olds
# * Green = four year olds
# * Red = five year olds
#
# Need to make in black and white.
# Need to make legend

# ## Expressive Language

# In[104]:

sns.distplot(np.array(receptive[receptive.ageGroup == 3].score),
             kde=False, hist=True, norm_hist=False)

# In[105]:

# `fig` is whatever sns.distplot returns; its y ticks are relabelled with
# the tick values multiplied by 6735.
fig = sns.distplot(np.array(receptive[receptive.ageGroup == 3].score),
                   kde=True, hist=False)
plt.yticks(fig.get_yticks(), fig.get_yticks() * 6735)
plt.ylabel('Counts', fontsize=16)

# In[106]:

sns.kdeplot(np.array(receptive[receptive.ageGroup == 3].score),
            bw=width, label="3-year-olds")
plt.ylabel('Density')
plt.yticks([0, 0.01, 0.02])
# plt.yticks(fig.get_yticks(), fig.get_yticks() * 100)

# In[107]:

def kdeplot(domain, width, score='score'):
    """Overlay one kernel density curve per age group (3, 4, 5).

    domain: frame with an 'ageGroup' column and the column named by *score*.
    width:  value forwarded to sns.kdeplot as `bw`.
    score:  name of the column to plot.
    """
    for age in (3, 4, 5):
        values = np.array(domain.loc[domain.ageGroup == age, score])
        sns.kdeplot(values, bw=width, label="{}-year-olds".format(age))
    plt.ylabel('Density')

# In[108]:

kdeplot(expressive_lang, 4)

# In[109]:

for age in (3, 4, 5):
    expressive_lang[expressive_lang.ageGroup == age].score.hist(
        alpha=0.3, label='{}-year-olds'.format(age))
plt.legend(loc=1)

# ## Receptive Language

# In[110]:

kdeplot(receptive_lang, 4)

# In[111]:

for age in (3, 4):
    receptive_lang[receptive_lang.ageGroup == age].score.hist(
        alpha=0.3, label='{}-year-olds'.format(age))
receptive_lang[receptive_lang.ageGroup == 5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)

# ## Articulation

# In[112]:

kdeplot(articulation, 4)

# In[113]:

for age in (3, 4, 5):
    articulation[articulation.ageGroup == age].score.hist(
        alpha=0.3, label='{}-year-olds'.format(age))
plt.legend(loc=2)

# ## Expressive Vocab

# In[114]:

kdeplot(expressive, 4)

# In[115]:

# Each histogram is labelled with its own age group (the original labelled
# all three '3-year-olds').
for age in (3, 4, 5):
    expressive[expressive.ageGroup == age].score.hist(
        alpha=0.3, label='{}-year-olds'.format(age))
plt.legend(loc=2)

# ## Receptive Vocab

# In[116]:

kdeplot(receptive, 4)

# In[117]:

# Same label fix as In[115].
for age in (3, 4, 5):
    receptive[receptive.ageGroup == age].score.hist(
        alpha=0.3, label='{}-year-olds'.format(age))
plt.legend(loc=2)

# In[118]:

def kdesubplot(domain, width, ax):
    """Draw one density curve per age group (3, 4, 5) of domain.score on *ax*.

    domain: frame with 'ageGroup' and 'score' columns.
    width:  value forwarded to sns.kdeplot as `bw`.
    ax:     matplotlib axes to draw on; its y label is set to 'Density'.
    """
    for age in (3, 4, 5):
        sns.kdeplot(np.array(domain[domain.ageGroup == age].score),
                    bw=width, label="{}-year-olds".format(age), ax=ax)
    # Label the axes we drew on; plt.ylabel would label whichever axes happens
    # to be current, which is not necessarily `ax`.
    ax.set_ylabel('Density')

# In[122]:

domains = [expressive_lang, receptive_lang, expressive, receptive, articulation]
names = ['Expressive language', 'Receptive language', 'Expressive vocabulary',
         'Receptive vocabulary', 'Articulation']

fig, axes = plt.subplots(3, 2, figsize=(10, 6))

# Five panels in a 3x2 grid; the sixth (bottom-right) cell is left empty.
for i, ax in enumerate(axes.ravel()[:-1]):
    kdesubplot(domains[i], 4, ax)
    ax.set_title(names[i])
    ax.set_yticks([])
    # Only the middle-left panel carries the y-axis label.
    ax.set_ylabel('Density' if i == 2 else '')
    if i > 2:
        ax.set_xlabel('Score')
    if i == 0:
        # Keep a single legend, on the first panel.
        ax.legend()
    else:
        # A legend may or may not have been created; only remove one that exists.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

axes[-1, -1].axis('off')
plt.tight_layout()
# fig.text(0, 0.5, 'Density', va='center', rotation='vertical')
# In[120]:

# One figure, 3x2 grid, one density panel per domain (sixth slot unused).
plt.figure()

panels = [
    (expressive_lang, 'Expressive Language'),
    (receptive_lang, 'Receptive Language'),
    (expressive, 'Expressive Vocabulary'),
    (receptive, 'Receptive Vocabulary'),
    (articulation, 'Articulation'),
]
for slot, (frame, heading) in enumerate(panels, start=1):
    plt.subplot(3, 2, slot)
    kdeplot(frame, 4)
    plt.title(heading)

plt.tight_layout()