%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
from pylab import *
# kernel density estimation
from scipy.stats import gaussian_kde
Apply grayscale first, seaborn whitegrid second to get greyscale plots.
grayscale = True
if grayscale:
plt.style.use('grayscale')
print(style.available)
['seaborn-darkgrid', 'seaborn-whitegrid', 'ggplot', 'seaborn-pastel', 'seaborn-poster', 'seaborn-deep', 'seaborn-notebook', 'seaborn-white', 'seaborn-dark-palette', 'seaborn-paper', 'bmh', 'seaborn-dark', 'seaborn-colorblind', 'dark_background', 'grayscale', 'classic', 'seaborn-ticks', 'seaborn-talk', 'fivethirtyeight', 'seaborn-bright', 'seaborn-muted']
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")
CHRIS = True
from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
if not CHRIS:
api_key = open("/Users/alicetoll/Documents/OPTION/token.txt").read()
else:
api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()
lsl_dr_project = Project(api_url, api_key)
articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
'owls_lc_ss','owls_oe_ss','age_test_owls',
'celfp_rl_ss','celfp_el_ss','age_test_celp',
'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[999, 9999]})
demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df',
df_kwargs={'index_col':None,
'na_values':[888, 999, 9999]})
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning: Columns (25,31,41) have mixed types. Specify dtype option on import or set low_memory=False. if self.run_code(code, result):
5 language measures:
# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()
language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls
language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()
language1["test_type"] = "receptive"
language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"
language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss
language2["test_type"] = "expressive"
language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"
language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss
language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))
test_type expressive receptive test_name CELF-4 659 565 CELF-P2 1722 1728 OWLS 1259 1267 PLS 4072 4083 There are 0 null values for score
language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language |
language['ageGroup'] = None # initial variable to none
language.loc[(language.age_test >= 36) & (language.age_test < 48), 'ageGroup'] = 3
language.loc[(language.age_test >= 48) & (language.age_test < 60), 'ageGroup'] = 4
language.loc[(language.age_test >= 60) & (language.age_test < 72), 'ageGroup'] = 5
language.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | ageGroup | |
---|---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language | 4 |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language | None |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language | 3 |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language | 4 |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language | 5 |
if CHRIS:
!mkdir DescriptiveFigures
mkdir: DescriptiveFigures: File exists
expressive_lang = language.loc[language.test_type=='expressive'].copy()
expressive_lang.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | ageGroup | |
---|---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 60 | expressive | PLS | 0101 | 54 | Language | 4 |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 82 | expressive | OWLS | 0101 | 113 | Language | None |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 68 | expressive | PLS | 0101 | 44 | Language | 3 |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | expressive | PLS | 0101 | 54 | Language | 4 |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 85 | expressive | CELF-P2 | 0101 | 68 | Language | 5 |
expressive_scores = (expressive_lang.groupby(['study_id','ageGroup'])
.apply(lambda x: x.pivot(columns='test_name', values='score')))
expressive_scores = (expressive_scores.set_index(expressive_scores.index.droplevel(2))
.reset_index()
.groupby(['study_id','ageGroup'])).apply(max)
expressive_owls_pls = expressive_scores[['OWLS', 'PLS']].dropna()
expressive_owls_pls.plot.scatter('OWLS', 'PLS')
<matplotlib.axes._subplots.AxesSubplot at 0x111e49908>
from sklearn import linear_model
reg_owls = linear_model.LinearRegression()
reg_owls.fit(expressive_owls_pls.OWLS.values.reshape(-1,1), expressive_owls_pls.PLS.values)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
expressive_lang.test_name.value_counts()
PLS 4072 CELF-P2 1722 OWLS 1259 CELF-4 659 Name: test_name, dtype: int64
expressive_lang['old_score'] = expressive_lang.score.copy()
pred_vals = reg_owls.predict(expressive_lang[expressive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='OWLS', 'score'] = pred_vals
expressive_celf_pls = expressive_scores[['CELF-P2', 'PLS']].dropna()
expressive_celf_pls.plot.scatter('CELF-P2', 'PLS')
<matplotlib.axes._subplots.AxesSubplot at 0x1134b3fd0>
reg_celf = linear_model.LinearRegression()
reg_celf.fit(expressive_celf_pls['CELF-P2'].values.reshape(-1,1), expressive_celf_pls.PLS.values)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
pred_vals = reg_celf.predict(expressive_lang[expressive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='CELF-P2', 'score'] = pred_vals
expressive_scores[['CELF-4', 'PLS']].dropna().plot.scatter('CELF-4', 'PLS')
<matplotlib.axes._subplots.AxesSubplot at 0x1122b26a0>
There arent enough points to fit a model for CELF-2, so we will combine these directly.
# Test type
expressive_lang["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 5728 Arizonia 525 Arizonia and Goldman 91 Name: test_type, dtype: int64 There are 0 null values for test_type
plt.style.use('grayscale')
sns.set_style("whitegrid")
bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
# Add some random "jitter" to the x-axis
x = np.random.normal(i, 0.04, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)
expressive_lang.ageGroup.value_counts()
# expressive_lang.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
3 1590 4 1572 5 1118 Name: ageGroup, dtype: int64
bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
# Add some random "jitter" to the x-axis
x = np.random.normal(i, 0.04, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)
# need to do this for some reason...
expressive_lang['scoreInt'] = np.array(expressive_lang.score, dtype = 'int')
expressive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])
mean | median | std | len | |
---|---|---|---|---|
ageGroup | ||||
3 | 82.952201 | 82 | 15.582549 | 1590 |
4 | 81.650127 | 81 | 16.563240 | 1572 |
5 | 80.762970 | 80 | 18.071186 | 1118 |
receptive_lang = language.loc[language.test_type=='receptive'].copy()
receptive_lang.head()
study_id | redcap_event_name | score | test_type | test_name | school | age_test | domain | ageGroup | |
---|---|---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 51 | receptive | PLS | 0101 | 54 | Language | 4 |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 61 | receptive | OWLS | 0101 | 113 | Language | None |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 55 | receptive | PLS | 0101 | 44 | Language | 3 |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 77 | receptive | PLS | 0101 | 54 | Language | 4 |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 93 | receptive | CELF-P2 | 0101 | 68 | Language | 5 |
receptive_lang.test_name.value_counts()
PLS 4083 CELF-P2 1728 OWLS 1267 CELF-4 565 Name: test_name, dtype: int64
receptive_scores = (receptive_lang.groupby(['study_id','ageGroup'])
.apply(lambda x: x.pivot(columns='test_name', values='score')))
receptive_scores = (receptive_scores.set_index(receptive_scores.index.droplevel(2))
.reset_index()
.groupby(['study_id','ageGroup'])).apply(max)
receptive_owls_pls = receptive_scores[['OWLS', 'PLS']].dropna()
receptive_owls_pls.plot.scatter('OWLS', 'PLS')
<matplotlib.axes._subplots.AxesSubplot at 0x1157340b8>
receptive_celfp2_pls = receptive_scores[['CELF-P2', 'PLS']].dropna()
receptive_celfp2_pls.plot.scatter('CELF-P2', 'PLS')
<matplotlib.axes._subplots.AxesSubplot at 0x113525668>
receptive_celf4_pls = receptive_scores[['CELF-4', 'PLS']].dropna()
receptive_celf4_pls.plot.scatter('CELF-4', 'PLS')
<matplotlib.axes._subplots.AxesSubplot at 0x115449828>
reg_owls = linear_model.LinearRegression()
reg_owls.fit(receptive_owls_pls.OWLS.values.reshape(-1,1), receptive_owls_pls.PLS.values)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
receptive_lang['old_score'] = receptive_lang.score.copy()
pred_vals = reg_owls.predict(receptive_lang[receptive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='OWLS', 'score'] = pred_vals
reg_celf = linear_model.LinearRegression()
reg_celf.fit(receptive_celfp2_pls['CELF-P2'].values.reshape(-1,1), receptive_celfp2_pls.PLS.values)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
pred_vals = reg_celf.predict(receptive_lang[receptive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='CELF-P2', 'score'] = pred_vals
receptive_lang['scoreInt'] = np.array(receptive_lang.score, dtype = 'int')
bp = receptive_lang.boxplot(column='scoreInt', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Language')
for i in [1,2,3]:
y = receptive_lang.score[receptive_lang.ageGroup==i+2].dropna()
# Add some random "jitter" to the x-axis
x = np.random.normal(i, 0.04, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recLang.png', dpi=300)
receptive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])
mean | median | std | len | |
---|---|---|---|---|
ageGroup | ||||
3 | 83.541509 | 84 | 18.573121 | 1590 |
4 | 83.525649 | 84 | 17.976144 | 1579 |
5 | 82.148858 | 81 | 17.876704 | 1095 |
# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"
print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))
# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]
Goldman 5728 Arizonia 525 Arizonia and Goldman 91 Name: test_type, dtype: int64 There are 0 null values for test_type
# create indicator variable if a student took either test
articulation.loc[articulation.aaps_ss.notnull() | articulation.gf2_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = articulation.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.aaps_ss.notnull() & temp.gf2_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()
0 6253 1 91 Name: both, dtype: int64
AAPS = temp.aaps_ss.notnull()
GF2 = temp.gf2_ss.notnull()
temp.loc[AAPS, "test_name"] = "AAPS"
temp.loc[GF2, "test_name"] = "GF2"
# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')
# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')
We have 6253 observations where a student took one test in a single year, but only 2932 unique students We have 91 observations where a student took both test in a single year, but only 62 unique students
reg = linear_model.LinearRegression()
reg.fit(both.aaps_ss.values.reshape(-1,1), both.gf2_ss.values)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
articulation['old_score'] = articulation.score.copy()
pred_vals = reg.predict(articulation[articulation.test_type=='Arizonia'].score.values.reshape(-1,1))
articulation.loc[articulation.test_type=='Arizonia', 'score'] = pred_vals
articulation["school"] = articulation.study_id.str.slice(0,4)
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())
count 6340.000000 mean 68.697950 std 30.557327 min 23.000000 25% 47.000000 50% 60.000000 75% 81.000000 max 243.000000 Name: age_test, dtype: float64
articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()
study_id | redcap_event_name | test_type | score | test | old_score | school | age_test | domain | |
---|---|---|---|---|---|---|---|---|---|
1 | 0101-2002-0101 | year_1_complete_71_arm_1 | Goldman | 78.0 | 1.0 | 78.0 | 0101 | 80.0 | Articulation |
9 | 0101-2003-0102 | initial_assessment_arm_1 | Goldman | 72.0 | 1.0 | 72.0 | 0101 | 44.0 | Articulation |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | Goldman | 97.0 | 1.0 | 97.0 | 0101 | 54.0 | Articulation |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | Goldman | 75.0 | 1.0 | 75.0 | 0101 | 53.0 | Articulation |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | Goldman | 80.0 | 1.0 | 80.0 | 0101 | 66.0 | Articulation |
articulation['ageGroup'] = None # initial variable to none
articulation.loc[(articulation.age_test >= 36) & (articulation.age_test < 48), 'ageGroup'] = 3
articulation.loc[(articulation.age_test >= 48) & (articulation.age_test < 60), 'ageGroup'] = 4
articulation.loc[(articulation.age_test >= 60) & (articulation.age_test < 72), 'ageGroup'] = 5
bp = articulation.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Articulation')
for i in [1,2,3]:
y = articulation.score[articulation.ageGroup==i+2].dropna()
# Add some random "jitter" to the x-axis
x = np.random.normal(i, 0.04, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/artic.png', dpi=300)
articulation.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
mean | median | std | len | |
---|---|---|---|---|
ageGroup | ||||
3 | 83.422264 | 84.0 | 18.680816 | 1281.0 |
4 | 82.868699 | 85.0 | 21.087332 | 1489.0 |
5 | 82.219630 | 86.0 | 21.043017 | 1131.0 |
# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))
expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]
There are 0 null values for test_type
expressive.test_type.value_counts()
EVT 4271 EOWPVT 3087 EOWPVT and EVT 176 Name: test_type, dtype: int64
# create indicator variable if a student took either test
expressive.loc[expressive.evt_ss.notnull() | expressive.eowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = expressive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.evt_ss.notnull() & temp.eowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()
0 7358 1 176 Name: both, dtype: int64
EVT = temp.evt_ss.notnull()
EOWPVT = temp.eowpvt_ss.notnull()
temp.loc[EVT, "test_name"] = "EVT"
temp.loc[EOWPVT, "test_name"] = "EOWPVT"
# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')
# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')
We have 7358 observations where a student took one test in a single year, but only 3333 unique students We have 176 observations where a student took both test in a single year, but only 127 unique students
del temp
reg = linear_model.LinearRegression()
reg.fit(both.evt_ss.values.reshape(-1,1), both.eowpvt_ss.values)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
expressive['old_score'] = expressive.score.copy()
pred_vals = reg.predict(expressive[expressive.test_type=='EVT'].score.values.reshape(-1,1))
expressive.loc[expressive.test_type=='EVT', 'score'] = pred_vals
expressive["school"] = expressive.study_id.str.slice(0,4)
expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]
expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
0 | 0101-2002-0101 | initial_assessment_arm_1 | 58.0 | EOWPVT | 0101 | 54.0 | Expressive Vocabulary |
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 84.0 | EOWPVT | 0101 | 80.0 | Expressive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 113.0 | Expressive Vocabulary |
14 | 0101-2004-0101 | year_2_complete_71_arm_1 | 90.0 | EOWPVT | 0101 | 53.0 | Expressive Vocabulary |
15 | 0101-2004-0101 | year_3_complete_71_arm_1 | 87.0 | EOWPVT | 0101 | 66.0 | Expressive Vocabulary |
expressive['ageGroup'] = None # initial variable to none
expressive.loc[(expressive.age_test >= 36) & (expressive.age_test < 48), 'ageGroup'] = 3
expressive.loc[(expressive.age_test >= 48) & (expressive.age_test < 60), 'ageGroup'] = 4
expressive.loc[(expressive.age_test >= 60) & (expressive.age_test < 72), 'ageGroup'] = 5
bp = expressive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Vocabulary')
for i in [1,2,3]:
y = expressive.score[expressive.ageGroup==i+2].dropna()
# Add some random "jitter" to the x-axis
x = np.random.normal(i, 0.04, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expVocab.png', dpi=300)
expressive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
mean | median | std | len | |
---|---|---|---|---|
ageGroup | ||||
3 | 90.898049 | 92.770977 | 18.628653 | 1542.0 |
4 | 90.023118 | 91.000000 | 18.967224 | 1726.0 |
5 | 89.124085 | 90.000000 | 17.289723 | 1283.0 |
receptive.columns
Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss', 'age_test_rowpvt', 'rowpvt_ss'], dtype='object')
# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))
receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]
There are 0 null values for test_type
# create indicator variable if a student took either test
receptive.loc[receptive.ppvt_ss.notnull() | receptive.rowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = receptive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.ppvt_ss.notnull() & temp.rowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()
0 7497 1 228 Name: both, dtype: int64
PPVT = temp.ppvt_ss.notnull()
ROWPVT = temp.rowpvt_ss.notnull()
temp.loc[PPVT, "test_name"] = "PPVT"
temp.loc[ROWPVT, "test_name"] = "ROWPVT"
# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')
# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')
We have 7497 observations where a student took one test in a single year, but only 3364 unique students We have 228 observations where a student took both test in a single year, but only 157 unique students
del temp
rv_reg = linear_model.LinearRegression()
rv_reg.fit(both.ppvt_ss.values.reshape(-1,1), both.rowpvt_ss.values)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
receptive['old_score'] = receptive.score.copy()
pred_vals = reg.predict(receptive[receptive.test_type=='PPVT'].score.values.reshape(-1,1))
receptive.loc[receptive.test_type=='PPVT', 'score'] = pred_vals
receptive["school"] = receptive.study_id.str.slice(0,4)
receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]
print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))
There are 23 null values for age_test
receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()
study_id | redcap_event_name | score | test_type | school | age_test | domain | |
---|---|---|---|---|---|---|---|
2 | 0101-2002-0101 | year_2_complete_71_arm_1 | 86.408510 | PPVT | 0101 | 80.0 | Receptive Vocabulary |
5 | 0101-2002-0101 | year_5_complete_71_arm_1 | 101.000000 | ROWPVT | 0101 | 113.0 | Receptive Vocabulary |
9 | 0101-2003-0102 | initial_assessment_arm_1 | 58.572717 | PPVT | 0101 | 44.0 | Receptive Vocabulary |
10 | 0101-2003-0102 | year_1_complete_71_arm_1 | 78.455426 | PPVT | 0101 | 54.0 | Receptive Vocabulary |
11 | 0101-2003-0102 | year_2_complete_71_arm_1 | 95.156903 | PPVT | 0101 | 68.0 | Receptive Vocabulary |
receptive.study_id.unique().shape
(3404,)
receptive['ageGroup'] = None # initial variable to none
receptive.loc[(receptive.age_test >= 36) & (receptive.age_test < 48), 'ageGroup'] = 3
receptive.loc[(receptive.age_test >= 48) & (receptive.age_test < 60), 'ageGroup'] = 4
receptive.loc[(receptive.age_test >= 60) & (receptive.age_test < 72), 'ageGroup'] = 5
bp = receptive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Vocabulary')
for i in [1,2,3]:
y = receptive.score[receptive.ageGroup==i+2].dropna()
# Add some random "jitter" to the x-axis
x = np.random.normal(i, 0.04, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recVocab.png', dpi=300)
receptive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])
mean | median | std | len | |
---|---|---|---|---|
ageGroup | ||||
3 | 89.766914 | 90.385052 | 16.611238 | 1603.0 |
4 | 88.455310 | 89.589744 | 17.323336 | 1756.0 |
5 | 87.784735 | 88.794435 | 15.319930 | 1312.0 |
width = 3
sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==4].score), bw=width, label = "4-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==5].score), bw=width, label = "5-year-olds")
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
<matplotlib.axes._subplots.AxesSubplot at 0x115c18cc0>
receptive[receptive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
receptive[receptive.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=2)
# plt.legend(bbox_to_anchor=(1.05, 1),loc=2, borderaxespad=0.)
# plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
# ncol=3, fancybox=True, shadow=True)
<matplotlib.legend.Legend at 0x1190c5320>
test_scores = pd.concat([articulation, expressive, receptive, language])
test_scores.head()
ageGroup | age_test | domain | old_score | redcap_event_name | school | score | study_id | test | test_name | test_type | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | None | 80 | Articulation | 78.0 | year_1_complete_71_arm_1 | 0101 | 78 | 0101-2002-0101 | 1.0 | NaN | Goldman |
9 | 3 | 44 | Articulation | 72.0 | initial_assessment_arm_1 | 0101 | 72 | 0101-2003-0102 | 1.0 | NaN | Goldman |
10 | 4 | 54 | Articulation | 97.0 | year_1_complete_71_arm_1 | 0101 | 97 | 0101-2003-0102 | 1.0 | NaN | Goldman |
14 | 4 | 53 | Articulation | 75.0 | year_2_complete_71_arm_1 | 0101 | 75 | 0101-2004-0101 | 1.0 | NaN | Goldman |
15 | 5 | 66 | Articulation | 80.0 | year_3_complete_71_arm_1 | 0101 | 80 | 0101-2004-0101 | 1.0 | NaN | Goldman |
print(test_scores.test_type.value_counts())
print(test_scores.domain.value_counts())
expressive 7712 receptive 7643 Goldman 5728 PPVT 4893 EVT 4271 EOWPVT 3087 ROWPVT 2604 Arizonia 525 PPVT and ROWPVT 228 EOWPVT and EVT 176 Arizonia and Goldman 91 Name: test_type, dtype: int64 Language 15355 Receptive Vocabulary 7725 Expressive Vocabulary 7534 Articulation 6344 Name: domain, dtype: int64
# make a new domain variable that contains only 5 categories
test_scores['domain2'] = test_scores.domain.copy()
test_scores.loc[test_scores.test_type == 'expressive', 'domain2'] = 'Expressive Language'
test_scores.loc[test_scores.test_type == 'receptive', 'domain2'] = 'Receptive Language'
test_scores.domain2.value_counts()
Receptive Vocabulary 7725 Expressive Language 7712 Receptive Language 7643 Expressive Vocabulary 7534 Articulation 6344 Name: domain2, dtype: int64
# make a dataframe that contains only 3, 4, and 5-year-olds
test_scores_345 = test_scores.loc[(test_scores.age_test >= 36) & (test_scores.age_test < 72), ]
test_scores_345.domain2.value_counts()
Receptive Vocabulary 4671 Expressive Vocabulary 4551 Expressive Language 4280 Receptive Language 4264 Articulation 3901 Name: domain2, dtype: int64
# Attempt to make black and white
bp = test_scores_345.boxplot(column='score', by='domain2', grid=False, sym='')
plt.title('Test scores for 3, 4, and 5-year-olds by functional outcome')
plt.suptitle("")
plt.xlabel(''); plt.ylabel('Standard score');
plt.xticks([1, 2, 3, 4, 5], ['Articulation', 'Expressive\nLanguage', 'Expressive\nVocabulary',
'Receptive\nLanguage', 'Receptive\nVocabulary'])
# Articulation
y = test_scores_345.score[test_scores_345.domain2=='Articulation'].dropna()
x = np.random.normal(1, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)
# Expressive Language
y = test_scores_345.score[test_scores_345.domain2=='Expressive Language'].dropna()
x = np.random.normal(2, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)
# Expressive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Expressive Vocabulary'].dropna()
x = np.random.normal(3, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)
# Receptive Language
y = test_scores_345.score[test_scores_345.domain2=='Receptive Language'].dropna()
x = np.random.normal(4, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)
# Receptive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Receptive Vocabulary'].dropna()
x = np.random.normal(5, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)
plt.axhline(y=85)
# for components in bp.keys():
# for line in bp[components]:
# line.set_color('black') # black lines
<matplotlib.lines.Line2D at 0x114e37e10>
Need to make in black and white.
Need to make legend
sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=False, hist=True, norm_hist=False);
fig = sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=True, hist=False)
plt.yticks(fig.get_yticks(), fig.get_yticks() * 6735)
plt.ylabel('Counts', fontsize=16)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
<matplotlib.text.Text at 0x11901ce10>
sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
plt.ylabel('Density')
plt.yticks([0, 0.01, 0.02]);
# plt.yticks(fig.get_yticks(), fig.get_yticks() * 100)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
def kdeplot(domain, width, score='score'):
sns.kdeplot(np.array(domain.loc[domain.ageGroup ==3, score]), bw=width, label = "3-year-olds")
sns.kdeplot(np.array(domain.loc[domain.ageGroup ==4, score]), bw=width, label = "4-year-olds")
sns.kdeplot(np.array(domain.loc[domain.ageGroup ==5, score]), bw=width, label = "5-year-olds")
plt.ylabel('Density')
kdeplot(expressive_lang, 4)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
expressive_lang[expressive_lang.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
expressive_lang[expressive_lang.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
expressive_lang[expressive_lang.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)
<matplotlib.legend.Legend at 0x113fa9d30>
kdeplot(receptive_lang, 4)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
receptive_lang[receptive_lang.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive_lang[receptive_lang.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
receptive_lang[receptive_lang.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)
<matplotlib.legend.Legend at 0x115f819e8>
kdeplot(articulation, 4)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
articulation[articulation.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
articulation[articulation.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
articulation[articulation.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=2)
<matplotlib.legend.Legend at 0x11336f358>
kdeplot(expressive, 4)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
expressive[expressive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
expressive[expressive.ageGroup ==4].score.hist(alpha=0.3, label='3-year-olds')
expressive[expressive.ageGroup ==5].score.hist(alpha=0.3, label='3-year-olds')
plt.legend(loc=2)
<matplotlib.legend.Legend at 0x1186c3898>
kdeplot(receptive, 4)
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
receptive[receptive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==4].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==5].score.hist(alpha=0.3, label='3-year-olds')
plt.legend(loc=2)
<matplotlib.legend.Legend at 0x115c3c9b0>
def kdesubplot(domain, width, ax):
sns.kdeplot(np.array(domain[domain.ageGroup ==3].score), bw=width, label = "3-year-olds", ax=ax)
sns.kdeplot(np.array(domain[domain.ageGroup ==4].score), bw=width, label = "4-year-olds", ax=ax)
sns.kdeplot(np.array(domain[domain.ageGroup ==5].score), bw=width, label = "5-year-olds", ax=ax)
plt.ylabel('Density')
domains = [expressive_lang, receptive_lang, expressive, receptive, articulation]
names = ['Expressive language', 'Receptive language', 'Expressive vocabulary', 'Receptive vocabulary', 'Articulation']
fig, axes = plt.subplots(3,2, figsize=(10,6))
for i,ax in enumerate(axes.ravel()[:-1]):
kdesubplot(domains[i], 4, ax)
ax.set_title(names[i])
ax.set_yticks([])
if i==2:
ax.set_ylabel('Density')
if i>2:
ax.set_xlabel('Score')
if i:
ax.legend_.remove()
axes[-1,-1].axis('off')
plt.tight_layout()
# fig.text(0, 0.5, 'Density', va='center', rotation='vertical')
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
plt.figure()
plt.subplot(3,2,1)
kdeplot(expressive_lang, 4)
plt.title('Expressive Language')
plt.subplot(3,2,2)
kdeplot(receptive_lang, 4)
plt.title('Receptive Language')
plt.subplot(3,2,3)
kdeplot(expressive, 4)
plt.title('Expressive Vocabulary')
plt.subplot(3,2,4)
kdeplot(receptive, 4)
plt.title('Receptive Vocabulary')
plt.subplot(3,2,5)
kdeplot(articulation, 4)
plt.title('Articulation')
plt.tight_layout()
/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j