#!/usr/bin/env python # coding: utf-8 # # Validation of Articulation tests # Contents # 1. [Problem background](#bkgnd) # 2. [Data Prep](#dataprep) # 3. [Students who took one test](#one) # 4. [Students who took both tests](#both) # # # Questions and Background # Students can either take the Arizona or Goldman-Fristoe test # **Question:** Can these two tests be combined? # Similar studentss should score the same. # How should we define "similar students"? # For now I will consider the variable included in our population characteristics summary table # Covariate | Redcap Variable # ----------|-------------------- # Age (Arizona) | `age_test_aaps` # Age (Goldman Firstoe) | `age_test_gf2` # Type of hearing loss | `type_hl_as`, `type_hl_ad` # Degree of hearing loss | `degree_hl` # Technology | `tech_as`, `tech_ad` # Gender | `male` # Race | `race` # Primary language spoken | `prim_lang` # Number of children in home | `sibs` # Age of hearing loss diagnosis | `onset_1` # Age first amplified | `age_amp` # Age of intervention | `age_int` # Age enrolled in OPTION | `age` # Length of time in OPTION | `time` # Known cause of hearing loss | `med_cause=0` # Known syndrome | `synd_cause=0` # Another diagnosed disability| `etiology` # Another disability impacts ability to learn | `etiology_2` # Other services outside of OPTION | `otherserv` # How will we asses this? # * Linear model with score as outcome and test type as predictor, controlling for predictors noted above # * If a student took both tests we could asses how well one test score predicts the other # Statistical considerations: # * Many covariates are listed above. Asses sample size to determine degrees of freedom. # * How to handle students with multiple observations? # - For now will take last observation # * How to handle students with both tests? # - For now only looking at students who took one test # - Could try to predict Arizona score form Goldman-Fristoe and vice versa # # # Data Prep # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import pandas as pd import numpy as np CHRIS = False # In[2]: # %pwd # In[3]: if CHRIS: print("change path to demographics file") else: get_ipython().run_line_magic('run', "'/Users/alicetoll/Documents/OPTION/LSL-DR/Demographics.ipynb'") # In[4]: if CHRIS: demographic = pd.read_csv('demographic.csv') else: demographic = pd.read_pickle('demographic.pkl') # In[5]: covar_fields = ['study_id', 'redcap_event_name', 'male', 'race', 'prim_lang', 'sib', 'onset_1', 'age_amp', 'age_int', 'age', 'time', 'med_cause', 'synd_cause', 'etiology', 'etiology_2', 'otherserv', 'type_hl_ad', 'type_hl_as'] covar = demographic[covar_fields] # Need to create vars and add: # - type of hearing loss # - degree of hearing loss # - technology # - CI Manufacturer # Do we want to collapse race? # Age is in articulation_fields # - age_test_aaps = Arizona # - age_test_gf2 = Goldman-Fristoe # In[6]: # Type of hearing loss # - Bilateral SNHL # - Unilateral SNHL # - Bilateral Auditory Neuropathy # - Unilateral Auditory Neuropathy covar['bilateral_snhl'] = 0 covar.loc[(covar.type_hl_ad == 0) & (covar.type_hl_as == 0), 'bilateral_snhl'] = 1 covar['unilateral_snhl'] = 0 covar.loc[(covar.type_hl_ad == 0) & (covar.type_hl_as == 4), 'unilateral_snhl'] = 1 covar.loc[(covar.type_hl_ad == 4) & (covar.type_hl_as == 0), 'unilateral_snhl'] = 1 covar['bilateral_an'] = 0 covar.loc[(covar.type_hl_ad == 3) & (covar.type_hl_as == 3), 'bilateral_an'] = 1 covar['unilateral_an'] = 0 covar.loc[(covar.type_hl_ad == 0) & (covar.type_hl_as == 3), 'unilateral_an'] = 1 covar.loc[(covar.type_hl_ad == 3) & (covar.type_hl_as == 0), 'unilateral_an'] = 1 # In[7]: # - Bilateral SNHL # - Unilateral SNHL # - Bilateral Auditory Neuropathy # - Unilateral Auditory Neuropathy # In[8]: bi_snhl = covar.bilateral_snhl == 1 uni_snhl = covar.unilateral_snhl == 1 bi_an = covar.bilateral_an == 1 uni_an = covar.unilateral_an == 1 covar.loc[bi_snhl, 'type_hl'] = "bi_snhl" covar.loc[uni_snhl, 'type_hl'] = "uni_snhl" covar.loc[bi_an, 'type_hl'] = "bi_an" covar.loc[uni_an, 'type_hl'] = "uni_an" # In[9]: articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss'] articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None, 'na_values':[999, 9999]}) # In[10]: # articulation.head() # In[11]: # create indicator variable if a student took either test articulation.loc[articulation.aaps_ss.notnull() | articulation.gf2_ss.notnull(), 'test'] = 1 # drop observations when neither test was taken temp = articulation.dropna(subset = ['test']) # Can drop test variable temp = temp.drop('test', 1) # Create new variable if student took both tests in one observation temp['both'] = 0 temp.loc[temp.aaps_ss.notnull() & temp.gf2_ss.notnull(), 'both'] = 1 print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one # temp.head() # Merge covariates and articulation data frames # In[12]: # merge temp2 = pd.merge(temp, covar, on=['study_id', 'redcap_event_name']) # In[13]: AAPS = temp2.aaps_ss.notnull() GF2 = temp2.gf2_ss.notnull() temp2.loc[AAPS, "test_name"] = "AAPS" temp2.loc[GF2, "test_name"] = "GF2" # Create datasets which contain students who either took one or both tests in a single year. If a student has multiple observations, take the last observation in each dataset. # In[14]: # One test single = temp2.loc[temp2.both==0,] a = single.shape[0] single = single.groupby('study_id').last() b = single.shape[0] print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students') # Both tests both = temp2.loc[temp2.both==1,] a = both.shape[0] both = both.groupby('study_id').last() b = both.shape[0] print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students') # In[15]: del temp del temp2 # # # Students who only took one test (within a single year) # In[16]: # Create variables to hold test score and age, and indicator for test name AAPS = single.aaps_ss.notnull() GF2 = single.gf2_ss.notnull() single.loc[AAPS, "test_name"] = "AAPS" single.loc[GF2, "test_name"] = "GF2" single.loc[AAPS, "score"] = single.aaps_ss single.loc[GF2, "score"] = single.gf2_ss single.loc[AAPS, "age_test"] = single.age_test_aaps single.loc[GF2, "age_test"] = single.age_test_gf2 # temp.head() # single = single.drop(['age_test_aaps', 'aaps_ss', 'age_test_gf2', 'gf2_ss', 'both'], 1) # artShort1.head() # In[17]: # Number of observations single.shape[0] # In[18]: # Missingness len(single.index)-single.count() # ## Plots # In[19]: get_ipython().run_line_magic('matplotlib', 'inline') import seaborn as sns # sns.set(style="white", color_codes=True) # In[20]: width = 3 sns.kdeplot(np.array(single[single.test_name == 'GF2'].score), bw=width, label = "Goldman-Fristoe") sns.kdeplot(np.array(single[single.test_name == 'AAPS'].score), bw=width, label = "Arizona") # In[21]: single.type_hl.value_counts() # We may want to consider looking at just bilateral SNHL vs other based on sample size. # In[22]: # Kernal Density Estimates broken out by type of hearing loss plt.figure() # Bilateral SNHL plt.subplot(2,2,1) sns.kdeplot(np.array(single.loc[(single.test_name == 'GF2') & (single.bilateral_snhl == 1), 'score']), bw=width) sns.kdeplot(np.array(single.loc[(single.test_name == 'AAPS') & (single.bilateral_snhl == 1), 'score']), bw=width) plt.title('Bilateral SNHL') plt.ylabel('Density') plt.xlabel('Standardized Score') # Unilateral SNHL plt.subplot(2,2,2) sns.kdeplot(np.array(single.loc[(single.test_name == 'GF2') & (single.unilateral_snhl == 1), 'score']), bw=width) sns.kdeplot(np.array(single.loc[(single.test_name == 'AAPS') & (single.unilateral_snhl == 1), 'score']), bw=width) plt.title('Unilateral SNHL') plt.ylabel('Density') plt.xlabel('Standardized Score') # Bilateral AN plt.subplot(2,2,3) sns.kdeplot(np.array(single.loc[(single.test_name == 'GF2') & (single.bilateral_an == 1), 'score']), bw=width, label = "Goldman-Fristoe") sns.kdeplot(np.array(single.loc[(single.test_name == 'AAPS') & (single.bilateral_an == 1), 'score']), bw=width, label = "Arizona") plt.title('Bilateral Auditory Neuropathy') plt.ylabel('Density') plt.xlabel('Standardized Score') # Unilateral AN plt.subplot(2,2,4) sns.kdeplot(np.array(single.loc[(single.test_name == 'GF2') & (single.unilateral_an == 1), 'score']), bw=width) # no obs for Arizona with Unilateral AN # sns.kdeplot(np.array(single.loc[(single.test_name == 'AAPS') & (single.unilateral_an == 1), 'score']), # bw=width, label = "Arizona") plt.title('Unilateral Auditory Neuropathy') plt.ylabel('Density') plt.xlabel('Standardized Score') plt.tight_layout() # ### Want to make this function def mykdeplot(df, var, width): sns.kdeplot(np.array(getattr(df[df.Group == 'a'], var)), bw=width, label = "Group A") sns.kdeplot(np.array(getattr(df[df.Group == 'b'], var)), bw=width, label = "Group B")def mykdeplot(df, var, width): sns.kdeplot(np.array(getattr(df[(artShort1.test_name == 'GF2') & (df.var == 1)], 'score')), bw=width, label = "Goldman-Fristoe") sns.kdeplot(np.array(getattr(artShort1[(df.test_name == 'AAPS') & (df.var == 1)], 'score')), bw=width, label = "Arizona")mykdeplot(single, 'bilateral_an', 5) # ## Linear Model # https://github.com/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb # this is the standard import if you're using "formula notation" (similar to R) import statsmodels.formula.api as smf # create a fitted model in one line lm = smf.ols(formula='score ~ test_name + age_test + male', data=artShort1).fit() # print the coefficients print(lm.params) # print the confidence intervals for the model coefficients print(lm.conf_int()) # In[23]: import statsmodels.formula.api as smf # Linear model predicting score from test name, age, gender, and type of hearing loss. # In[24]: lm = smf.ols(formula="""score ~ test_name + age_test + male + bilateral_snhl + unilateral_snhl + bilateral_an + unilateral_an""", data=single).fit() lm.summary() # From the output above, we see `test_name` is not significant (p = 0.334). This tells us that after accounting for the student's age when taking the test, gender, and type of hearing loss, knowing which test the student took does not help us to predict there score. This suggests that similar students score the same regardless of which test they took. # # We need to determine which variables should be included in our model. # # # Students who only took both tests (within a single year) # ## Introduction # We currently treat standardized Arizona and Goldman-Fristoe tests scores the same. The goal of this analysis is to relate the two tests so they can be transformed for future modeling purposes. # # We only have 55 observations (before accounting for missingness). That only leaves us with 3-6 degress of freedom for modeling. A continuous variable will use up one degree of freedom, and the number of deggrees of freedom used by a categorical variable is one less than the number of categories. # # My initial intuition was to model Arizona score as a function of Goldman-Fristoe score, age, type of hearing loss, and whether or not the student receives other learning servies. Unfortunatly on this small sample size the observations for type of hearing loss and other serivces are rather homongenous, so I do not believe they will be very helpful in predicted Arizona score. # # We may want to allow for nonlinearity by including a spline term for the Goldman-Fristoe score. A spline with 3 knots would use 3 degrees of freedom. # # The candidate models considered are sumarized in the subsection titled **Models**. # # Variables that were eliminated from consideration due to homogenity # * `type_hl` (47 out of 55 students have bilateral SNHL) # * `etiology` (42 out of 55 have another diagnosed disability) # * `otherserv` (35 out of 55 do not recieve other educational services, 12 out of 55 are missing) # **Response: Arizona scores (`aaps_ss`)** # # Considered Covariate | Redcap Variable | Degrees of Freedom # ----------|-------------------------------------- # Goldman Fristoe Score | gf2_ss | 1 # Goldman Fristoe Score, spline | gf2_ss | 3 # Age (Arizona) | `age_test_aaps` | 1 # Age (Goldman Firstoe) | `age_test_gf2` | 1 # Gender | `male` | 1 # Race (White/Non-White) | `white` | 1 # Age of hearing loss diagnosis | `onset_1` | 1 # Length of time in OPTION | `time` | 1 # Another disability impacts ability to learn | `etiology_impact` | 1 # ## Potential variables for modelings # In[25]: # With 55 observations (before missingness), we really only have 3-6 degrees of freedom to use in a model. both.shape # In[26]: # Arizona is more conservative towards "normal" (85) # May want to consider a spline for Goldman-Fristoe width = 4 sns.kdeplot(np.array(both.gf2_ss), bw=width, label = "Goldman-Fristoe") sns.kdeplot(np.array(both.aaps_ss), bw=width, label = "Arizona") plt.ylabel('Density') plt.xlabel('Standardized Score') # In[27]: # Large age range (in months). May want to use a spline for age # Don't have enough degrees of freedom to spline both variables sns.kdeplot(both.age_test_gf2, bw=5, label = "Goldman-Fristoe") sns.kdeplot(both.age_test_aaps, bw=5, label = "Arizona") plt.ylabel('Density') plt.xlabel('Age (months)') # In[28]: # Race has a lot of categories # To preserve degrees of freedom we can reduce to white/non-white both.race.value_counts(dropna=False, sort=False) # 0 = Cacucasian # 1 = Black or African American # 2 = Hispanic or Latino # 3 = Asian # 4 = Native Hawaiians/Pacific Islanders # 5 = American Indian/Alaska Natives # 6 = Multiracial # 7 = Unknown # 8 = Other # In[29]: both['white'] = np.nan both.loc[(both.race == 0), 'white'] = 1 both.loc[(both.race !=0) & (pd.notnull(both.race)), 'white'] = 0 both.white.value_counts(dropna=False, sort=False) # In[30]: # Etiology 2: impact of the other condition(s) that contribute to the child's overall learning abilities # To preserve degrees of freedom we can reduce to impacts (1, 2, 3) and doesn't imact (0, 4) both.etiology_2.value_counts(dropna=False, sort=False) # 0 = None # 1 = Mild # 2 = Moderate # 3 = Severe # 4 = N/A (no additional disability suspected or identified) # In[31]: both['etiology_impact'] = np.nan both.loc[(both.etiology_2 == 0) | (both.etiology_2 == 4), 'etiology_impact'] = 0 both.loc[(both.etiology_2 == 1) | (both.etiology_2 == 2) | (both.etiology_2 == 3), 'etiology_impact'] = 1 both.etiology_impact.value_counts(dropna=False) # In[32]: # Amount of time enrolled in OPTION: 2 missing obs len(both.time.index)-both.time.count() # In[33]: # Age of onset: 10 missing obs # This is a lot of observations to sacrifice # Would be good if we could use multiple imputation len(both.onset_1.index)-both.onset_1.count() # ## Variables eliminated due to heterogeneity # In[34]: # Most of the students who took both tests have bilateral SNHL # 4 observations are missing # Because a large majority of the students only have bilateral SNHL, it will not be a useful predictor. both.type_hl.value_counts(dropna=False) # In[35]: # 42 out of 55 do not have another diagnosed disability # note: this doesn't agree with etiology_2 coding both.etiology.value_counts(dropna=False, sort=False) # In[36]: # Other services: 12 missing obs # Most students do not recieve other services both.otherserv.value_counts(dropna=False, sort=False) # 0 = Yes # 1 = No # 2 = Unknown # ## Models # Model Name | Variables | n obs | df |R-squared # -----------|-----------|-------|----|------------ # m1 | gf2_ss + age_test_gf2 + age_test_aaps + male + white + etiology_impact | 52 | 6 | 0.782 # m2 | gf2_ss + age_test_gf2 + age_test_aaps | 55 | 3 | 0.771 # m3 | bs(gf2_ss, 3) + age_test_gf2 + age_test_aaps | 55 | 5 | 0.781 # m4 | cr(gf2_ss, df=3, constraints='center') + age_test_gf2 + age_test_aaps | 55 | 5 | 0.780 # m5 | gf2_ss + age_test_gf2 + age_test_aaps + time + onset_1 | 45 | 5 | 0.877 # m6 | gf2_ss + age_test_gf2 + age_test_aaps + time | 52 | 4 | 0.819 # In[37]: m1 = smf.ols(formula="""aaps_ss ~ gf2_ss + age_test_gf2 + age_test_aaps + male + white + etiology_impact""", data=both).fit() m1.summary() # In[38]: m2 = smf.ols(formula="""aaps_ss ~ gf2_ss + age_test_gf2 + age_test_aaps""", data=both).fit() m2.summary() # In[39]: # Use to print anova tables (for when we use splines) import statsmodels.api as sm # In[40]: # bs() B-spline m3 = smf.ols(formula="""aaps_ss ~ bs(gf2_ss, 3) + age_test_gf2 + age_test_aaps""", data=both).fit() m3.summary() # In[41]: sm.stats.anova_lm(m3, typ=2) # In[42]: # cr() natural cubic spline m4 = smf.ols(formula="""aaps_ss ~ cr(gf2_ss, df=3, constraints='center') + age_test_gf2 + age_test_aaps""", data=both).fit() m4.summary() # In[43]: sm.stats.anova_lm(m4, typ=2) # In[44]: m5 = smf.ols(formula="""aaps_ss ~ gf2_ss + age_test_gf2 + age_test_aaps + time + onset_1""", data=both).fit() m5.summary() # In[45]: m6 = smf.ols(formula="""aaps_ss ~ gf2_ss + age_test_gf2 + age_test_aaps + time""", data=both).fit() m6.summary() # ## Plots # Seaborn Jointplot: https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.jointplot.html # In[46]: import scipy # (correlation, p-value) pc = scipy.stats.pearsonr(both.gf2_ss, both.aaps_ss) print("Peasron Corelation: " "%.3f" % pc[0]) sc = scipy.stats.spearmanr(both.gf2_ss, both.aaps_ss) print("Spearman Corelation: " "%.3f" % sc[0]) # In[47]: sns.lmplot("gf2_ss", "aaps_ss", data=both, fit_reg=False) sns.regplot(both.gf2_ss, both.aaps_ss, scatter=False, lowess=True, label='LOWESS') plt.plot([50, 110], [50, 110], 'red', label='Identity'); legend = plt.legend(loc='upper left') # In[48]: # pearson correlation g = sns.jointplot(x="gf2_ss", y="aaps_ss", data=both, marginal_kws=dict(bins=15, rug=True)) # for spearman correlation, add 'stat_func=spearmanr' to joinplot() x0, x1 = g.ax_joint.get_xlim() y0, y1 = g.ax_joint.get_ylim() lims = [max(x0, y0)-10, min(x1, y1)+10] g.ax_joint.plot(lims, lims, ':k') # In[49]: # regression line with confidence bands sns.jointplot(x="gf2_ss", y="aaps_ss", data=both, kind="reg", marginal_kws=dict(bins=15, rug=True)); # In[50]: fig = plt.figure(figsize=(12,15)) fig = sm.graphics.plot_regress_exog(m1, 'gf2_ss', fig=fig) # In[51]: fig, ax = plt.subplots(figsize=(8,6)) ax.plot(np.array(both.aaps_ss), m3.fittedvalues, 'r.') plt.plot([65, 105], [65, 105], 'blue', label='Identity') legend = plt.legend(loc='upper left') plt.xlabel('Observed Arizona Score') plt.ylabel('Predicted Arizona Score') plt.title('Model 3') # In[52]: fig, ax = plt.subplots(figsize=(8,6)) ax.plot(np.array(both.gf2_ss), m3.fittedvalues, 'r.', label='Predicted') ax.plot(np.array(both.gf2_ss), np.array(both.aaps_ss), 'g.', label='Observed') plt.plot([35, 110], [35, 110], 'blue', label='Identity'); legend = plt.legend(loc='upper left') plt.xlabel('Goldman-Fristoe Score') plt.ylabel('Arizona Score') plt.title('Model 3') ax.set_xlim([30,120]) ax.set_ylim([60,110]) # In[53]: aaps_m6 = both[both['time'].notnull()].aaps_ss gf2_m6 = both[both['time'].notnull()].gf2_ss # In[54]: fig, ax = plt.subplots(figsize=(8,6)) ax.plot(aaps_m6, m6.fittedvalues, 'r.') plt.plot([65, 105], [65, 105], 'blue', label='Identity') legend = plt.legend(loc='upper left') plt.xlabel('Observed Arizona Score') plt.ylabel('Predicted Arizona Score') plt.title('Model 6') # In[55]: fig, ax = plt.subplots(figsize=(8,6)) ax.plot(gf2_m6, m6.fittedvalues, 'r.', label='Predicted') ax.plot(gf2_m6, aaps_m6, 'g.', label='Observed') plt.plot([35, 110], [35, 110], 'blue', label='Identity'); legend = plt.legend(loc='upper left') plt.xlabel('Goldman-Fristoe Score') plt.ylabel('Arizona Score') plt.title('Model 6') ax.set_xlim([30,120]) ax.set_ylim([60,110]) # ## Future Considerations # In an effort to increase sample size, we could model all observations with both scores (73) and use clustered robust standard errors or build a repeated measures model. # # For robust standard errors see http://statsmodels.sourceforge.net/0.6.0/stats.html # `sandwich_covariance.cov_cluster(results, group)`