In [17]:

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
from pylab import *

In [18]:

# kernel density estimation
from scipy.stats import gaussian_kde

Apply grayscale first, seaborn whitegrid second to get greyscale plots.

In [19]:

grayscale = True

if grayscale:
    plt.style.use('grayscale')

In [20]:

print(style.available)

['seaborn-darkgrid', 'seaborn-whitegrid', 'ggplot', 'seaborn-pastel', 'seaborn-poster', 'seaborn-deep', 'seaborn-notebook', 'seaborn-white', 'seaborn-dark-palette', 'seaborn-paper', 'bmh', 'seaborn-dark', 'seaborn-colorblind', 'dark_background', 'grayscale', 'classic', 'seaborn-ticks', 'seaborn-talk', 'fivethirtyeight', 'seaborn-bright', 'seaborn-muted']

In [21]:

%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")

In [22]:

CHRIS = True

In [23]:

from redcap import Project
api_url = 'https://redcap.vanderbilt.edu/api/'
if not CHRIS: 
    api_key = open("/Users/alicetoll/Documents/OPTION/token.txt").read()
else:
    api_key = open("/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt").read()

lsl_dr_project = Project(api_url, api_key)

In [24]:

articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']
articulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})

In [25]:

expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']
expressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', 
                                           df_kwargs={'index_col':None,
                                                      'na_values':[999, 9999]})

In [26]:

receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']
receptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', 
                                          df_kwargs={'index_col':None,
                                                     'na_values':[999, 9999]})

In [27]:

language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',
                   'owls_lc_ss','owls_oe_ss','age_test_owls',
                   'celfp_rl_ss','celfp_el_ss','age_test_celp',
                   'celf_elss','celf_rlss','age_test_celf']
language_raw = lsl_dr_project.export_records(fields=language_fields, format='df', 
                                             df_kwargs={'index_col':None, 
                                                        'na_values':[999, 9999]})

In [28]:

demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year',
'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',
'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',
'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',
'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',
'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age',
'att_days_hr', 'att_days_sch', 'att_days_st2_417']
demographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', 
                                            df_kwargs={'index_col':None, 
                                                       'na_values':[888, 999, 9999]})

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning: Columns (25,31,41) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):

Language¶

5 language measures:

3 versions of CELF
PLS
- pls_ac_rs: PLS: Auditory Comprehension Raw Score
- pls_ac_ss: PLS: Auditory Comprehension Standard Score
- pls_ec_rs: PLS: Expressive Communication Raw Score
- pls_ec_ss: PLS: Expressive Communication Standard Score
- pls_tl_rs: PLS: Total Language Score Standard Score Total
- pls_tl_ss: PLS: Total Language Score Standard Score
OWLS
- age_test_owls: Age at time of testing (OWLS)
- owls_lc_rs: OWLS: Listening Comprehension Raw Score
- owls_lc_ss: OWLS: Listening Comprehension Standard Score
- owls_oe_rs: OWLS: Oral Expression Raw Score
- owls_oe_ss: OWLS: Oral Expression Standard Score
- owls_oc_sss: OWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores
- owls_oc_ss: OWLS: Oral Composite Standard Score
- owls_wes_trs: OWLS: Written Expression Scale Total Raw Score
- owls_wes_as: OWLS: Written Expression Scale Ability Score
- owls_wes_ss: OWLS: Written Expression Scale Standard Score
- owsl_lc: OWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)
- owls_lcss: OWLS: Language Composite Standard Score

In [29]:

# Test type
language_raw["test_name"] = None
language_raw["test_type"] = None
language_raw["score"] = None
CELP = language_raw.age_test_celp.notnull()
CELF = language_raw.age_test_celf.notnull()
PLS = language_raw.age_test_pls.notnull()
OWLS = language_raw.age_test_owls.notnull()

language_raw['age_test'] = None
language_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp
language_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf
language_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls
language_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls

language1 = language_raw[CELP | CELF | PLS | OWLS].copy()
language2 = language1.copy()

language1["test_type"] = "receptive"

language1.loc[CELP, "test_name"] = "CELF-P2"
language1.loc[CELF, "test_name"] = "CELF-4"
language1.loc[PLS, "test_name"] = "PLS"
language1.loc[OWLS, "test_name"] = "OWLS"

language1.loc[CELP, "score"] = language1.celfp_rl_ss
language1.loc[CELF, "score"] = language1.celf_rlss
language1.loc[PLS, "score"] = language1.pls_ac_ss
language1.loc[OWLS, "score"] = language1.owls_lc_ss


language2["test_type"] = "expressive"

language2.loc[CELP, "test_name"] = "CELF-P2"
language2.loc[CELF, "test_name"] = "CELF-4"
language2.loc[PLS, "test_name"] = "PLS"
language2.loc[OWLS, "test_name"] = "OWLS"

language2.loc[CELP, "score"] = language1.celfp_el_ss
language2.loc[CELF, "score"] = language1.celf_elss
language2.loc[PLS, "score"] = language1.pls_ec_ss
language2.loc[OWLS, "score"] = language1.owls_oe_ss

language = pd.concat([language1, language2])
language = language[language.score.notnull()]
print(pd.crosstab(language.test_name, language.test_type))
print("There are {0} null values for score".format(sum(language["score"].isnull())))

test_type  expressive  receptive
test_name                       
CELF-4            659        565
CELF-P2          1722       1728
OWLS             1259       1267
PLS              4072       4083
There are 0 null values for score

In [30]:

language["school"] = language.study_id.str.slice(0,4)
language = language[["study_id", "redcap_event_name", "score", "test_type", "test_name", "school", "age_test"]]
language["domain"] = "Language"
language.head()

Out[30]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain
0	0101-2002-0101	initial_assessment_arm_1	51	receptive	PLS	0101	54	Language
5	0101-2002-0101	year_5_complete_71_arm_1	61	receptive	OWLS	0101	113	Language
9	0101-2003-0102	initial_assessment_arm_1	55	receptive	PLS	0101	44	Language
10	0101-2003-0102	year_1_complete_71_arm_1	77	receptive	PLS	0101	54	Language
11	0101-2003-0102	year_2_complete_71_arm_1	93	receptive	CELF-P2	0101	68	Language

In [31]:

language['ageGroup'] = None # initial variable to none
language.loc[(language.age_test >= 36) & (language.age_test < 48), 'ageGroup'] = 3 
language.loc[(language.age_test >= 48) & (language.age_test < 60), 'ageGroup'] = 4 
language.loc[(language.age_test >= 60) & (language.age_test < 72), 'ageGroup'] = 5 
language.head()

Out[31]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain	ageGroup
0	0101-2002-0101	initial_assessment_arm_1	51	receptive	PLS	0101	54	Language	4
5	0101-2002-0101	year_5_complete_71_arm_1	61	receptive	OWLS	0101	113	Language	None
9	0101-2003-0102	initial_assessment_arm_1	55	receptive	PLS	0101	44	Language	3
10	0101-2003-0102	year_1_complete_71_arm_1	77	receptive	PLS	0101	54	Language	4
11	0101-2003-0102	year_2_complete_71_arm_1	93	receptive	CELF-P2	0101	68	Language	5

Expressive Language¶

In [32]:

if CHRIS:
    !mkdir DescriptiveFigures

mkdir: DescriptiveFigures: File exists

In [33]:

expressive_lang = language.loc[language.test_type=='expressive'].copy()
expressive_lang.head()

Out[33]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain	ageGroup
0	0101-2002-0101	initial_assessment_arm_1	60	expressive	PLS	0101	54	Language	4
5	0101-2002-0101	year_5_complete_71_arm_1	82	expressive	OWLS	0101	113	Language	None
9	0101-2003-0102	initial_assessment_arm_1	68	expressive	PLS	0101	44	Language	3
10	0101-2003-0102	year_1_complete_71_arm_1	77	expressive	PLS	0101	54	Language	4
11	0101-2003-0102	year_2_complete_71_arm_1	85	expressive	CELF-P2	0101	68	Language	5

Create models for converting scores¶

In [34]:

expressive_scores = (expressive_lang.groupby(['study_id','ageGroup'])
                     .apply(lambda x: x.pivot(columns='test_name', values='score')))

In [35]:

expressive_scores = (expressive_scores.set_index(expressive_scores.index.droplevel(2))
                         .reset_index()
                        .groupby(['study_id','ageGroup'])).apply(max)

In [36]:

expressive_owls_pls = expressive_scores[['OWLS', 'PLS']].dropna()
expressive_owls_pls.plot.scatter('OWLS', 'PLS')

Out[36]:

<matplotlib.axes._subplots.AxesSubplot at 0x111e49908>

In [37]:

from sklearn import linear_model
reg_owls = linear_model.LinearRegression()
reg_owls.fit(expressive_owls_pls.OWLS.values.reshape(-1,1), expressive_owls_pls.PLS.values)

Out[37]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [38]:

expressive_lang.test_name.value_counts()

Out[38]:

PLS        4072
CELF-P2    1722
OWLS       1259
CELF-4      659
Name: test_name, dtype: int64

In [39]:

expressive_lang['old_score'] = expressive_lang.score.copy()
pred_vals = reg_owls.predict(expressive_lang[expressive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='OWLS', 'score'] = pred_vals

In [40]:

expressive_celf_pls = expressive_scores[['CELF-P2', 'PLS']].dropna()
expressive_celf_pls.plot.scatter('CELF-P2', 'PLS')

Out[40]:

<matplotlib.axes._subplots.AxesSubplot at 0x1134b3fd0>

In [41]:

reg_celf = linear_model.LinearRegression()
reg_celf.fit(expressive_celf_pls['CELF-P2'].values.reshape(-1,1), expressive_celf_pls.PLS.values)

Out[41]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [42]:

pred_vals = reg_celf.predict(expressive_lang[expressive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
expressive_lang.loc[expressive_lang.test_name=='CELF-P2', 'score'] = pred_vals

In [43]:

expressive_scores[['CELF-4', 'PLS']].dropna().plot.scatter('CELF-4', 'PLS')

Out[43]:

<matplotlib.axes._subplots.AxesSubplot at 0x1122b26a0>

There arent enough points to fit a model for CELF-2, so we will combine these directly.

In [44]:

# Test type
expressive_lang["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]

Goldman                 5728
Arizonia                 525
Arizonia and Goldman      91
Name: test_type, dtype: int64
There are 0 null values for test_type

In [45]:

plt.style.use('grayscale')
sns.set_style("whitegrid")
bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
    y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)
    
expressive_lang.ageGroup.value_counts()
# expressive_lang.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

Out[45]:

3    1590
4    1572
5    1118
Name: ageGroup, dtype: int64

In [46]:

bp = expressive_lang.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Language')
for i in [1,2,3]:
    y = expressive_lang.score[expressive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expLang.png', dpi=300)

In [47]:

# need to do this for some reason...
expressive_lang['scoreInt'] = np.array(expressive_lang.score, dtype = 'int')
expressive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])

Out[47]:

	mean	median	std	len
ageGroup
3	82.952201	82	15.582549	1590
4	81.650127	81	16.563240	1572
5	80.762970	80	18.071186	1118

Receptive Language¶

In [48]:

receptive_lang = language.loc[language.test_type=='receptive'].copy()
receptive_lang.head()

Out[48]:

	study_id	redcap_event_name	score	test_type	test_name	school	age_test	domain	ageGroup
0	0101-2002-0101	initial_assessment_arm_1	51	receptive	PLS	0101	54	Language	4
5	0101-2002-0101	year_5_complete_71_arm_1	61	receptive	OWLS	0101	113	Language	None
9	0101-2003-0102	initial_assessment_arm_1	55	receptive	PLS	0101	44	Language	3
10	0101-2003-0102	year_1_complete_71_arm_1	77	receptive	PLS	0101	54	Language	4
11	0101-2003-0102	year_2_complete_71_arm_1	93	receptive	CELF-P2	0101	68	Language	5

In [49]:

receptive_lang.test_name.value_counts()

Out[49]:

PLS        4083
CELF-P2    1728
OWLS       1267
CELF-4      565
Name: test_name, dtype: int64

In [50]:

receptive_scores = (receptive_lang.groupby(['study_id','ageGroup'])
                     .apply(lambda x: x.pivot(columns='test_name', values='score')))

In [51]:

receptive_scores = (receptive_scores.set_index(receptive_scores.index.droplevel(2))
                         .reset_index()
                        .groupby(['study_id','ageGroup'])).apply(max)

In [52]:

receptive_owls_pls = receptive_scores[['OWLS', 'PLS']].dropna()
receptive_owls_pls.plot.scatter('OWLS', 'PLS')

Out[52]:

<matplotlib.axes._subplots.AxesSubplot at 0x1157340b8>

In [53]:

receptive_celfp2_pls = receptive_scores[['CELF-P2', 'PLS']].dropna()
receptive_celfp2_pls.plot.scatter('CELF-P2', 'PLS')

Out[53]:

<matplotlib.axes._subplots.AxesSubplot at 0x113525668>

In [54]:

receptive_celf4_pls = receptive_scores[['CELF-4', 'PLS']].dropna()
receptive_celf4_pls.plot.scatter('CELF-4', 'PLS')

Out[54]:

<matplotlib.axes._subplots.AxesSubplot at 0x115449828>

In [55]:

reg_owls = linear_model.LinearRegression()
reg_owls.fit(receptive_owls_pls.OWLS.values.reshape(-1,1), receptive_owls_pls.PLS.values)

Out[55]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [56]:

receptive_lang['old_score'] = receptive_lang.score.copy()
pred_vals = reg_owls.predict(receptive_lang[receptive_lang.test_name=='OWLS'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='OWLS', 'score'] = pred_vals

In [57]:

reg_celf = linear_model.LinearRegression()
reg_celf.fit(receptive_celfp2_pls['CELF-P2'].values.reshape(-1,1), receptive_celfp2_pls.PLS.values)

Out[57]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [58]:

pred_vals = reg_celf.predict(receptive_lang[receptive_lang.test_name=='CELF-P2'].score.values.reshape(-1,1))
receptive_lang.loc[receptive_lang.test_name=='CELF-P2', 'score'] = pred_vals

In [59]:

receptive_lang['scoreInt'] = np.array(receptive_lang.score, dtype = 'int')

In [60]:

bp = receptive_lang.boxplot(column='scoreInt', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Language')
for i in [1,2,3]:
    y = receptive_lang.score[receptive_lang.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recLang.png', dpi=300)

receptive_lang.groupby('ageGroup')['scoreInt'].agg([np.mean, np.median, np.std, len])

Out[60]:

	mean	median	std	len
ageGroup
3	83.541509	84	18.573121	1590
4	83.525649	84	17.976144	1579
5	82.148858	81	17.876704	1095

Articulation¶

In [61]:

# Test type
articulation["test_type"] = None
ARIZ = articulation.aaps_ss.notnull()
GF = articulation.gf2_ss.notnull()
articulation = articulation[ARIZ | GF]
articulation.loc[(ARIZ & GF), "test_type"] = "Arizonia and Goldman"
articulation.loc[(ARIZ & ~GF), "test_type"] = "Arizonia"
articulation.loc[(~ARIZ & GF), "test_type"] = "Goldman"

print(articulation.test_type.value_counts())
print("There are {0} null values for test_type".format(sum(articulation["test_type"].isnull())))

# Test score (Arizonia if both)
articulation["score"] = articulation.aaps_ss
articulation.loc[(~ARIZ & GF), "score"] = articulation.gf2_ss[~ARIZ & GF]

Goldman                 5728
Arizonia                 525
Arizonia and Goldman      91
Name: test_type, dtype: int64
There are 0 null values for test_type

Map Arizonia onto Goldman¶

In [62]:

# create indicator variable if a student took either test
articulation.loc[articulation.aaps_ss.notnull() | articulation.gf2_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = articulation.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.aaps_ss.notnull() & temp.gf2_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()

0    6253
1      91
Name: both, dtype: int64

In [63]:

AAPS = temp.aaps_ss.notnull()
GF2 = temp.gf2_ss.notnull()

temp.loc[AAPS, "test_name"] = "AAPS"
temp.loc[GF2, "test_name"] = "GF2"

In [64]:

# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')

We have 6253 observations where a student took one test in a single year, but only 2932 unique students
We have 91 observations where a student took both test in a single year, but only 62 unique students

In [65]:

reg = linear_model.LinearRegression()
reg.fit(both.aaps_ss.values.reshape(-1,1), both.gf2_ss.values)

Out[65]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [66]:

articulation['old_score'] = articulation.score.copy()
pred_vals = reg.predict(articulation[articulation.test_type=='Arizonia'].score.values.reshape(-1,1))
articulation.loc[articulation.test_type=='Arizonia', 'score'] = pred_vals

In [67]:

articulation["school"] = articulation.study_id.str.slice(0,4)
articulation["age_test"] = articulation.age_test_aaps
articulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]
print(articulation.age_test.describe())

count    6340.000000
mean       68.697950
std        30.557327
min        23.000000
25%        47.000000
50%        60.000000
75%        81.000000
max       243.000000
Name: age_test, dtype: float64

In [68]:

articulation = articulation.drop(["age_test_aaps", "age_test_gf2", "aaps_ss", "gf2_ss"], axis=1)
articulation["domain"] = "Articulation"
articulation.head()

Out[68]:

	study_id	redcap_event_name	test_type	score	test	old_score	school	age_test	domain
1	0101-2002-0101	year_1_complete_71_arm_1	Goldman	78.0	1.0	78.0	0101	80.0	Articulation
9	0101-2003-0102	initial_assessment_arm_1	Goldman	72.0	1.0	72.0	0101	44.0	Articulation
10	0101-2003-0102	year_1_complete_71_arm_1	Goldman	97.0	1.0	97.0	0101	54.0	Articulation
14	0101-2004-0101	year_2_complete_71_arm_1	Goldman	75.0	1.0	75.0	0101	53.0	Articulation
15	0101-2004-0101	year_3_complete_71_arm_1	Goldman	80.0	1.0	80.0	0101	66.0	Articulation

In [69]:

articulation['ageGroup'] = None # initial variable to none
articulation.loc[(articulation.age_test >= 36) & (articulation.age_test < 48), 'ageGroup'] = 3 
articulation.loc[(articulation.age_test >= 48) & (articulation.age_test < 60), 'ageGroup'] = 4 
articulation.loc[(articulation.age_test >= 60) & (articulation.age_test < 72), 'ageGroup'] = 5 

bp = articulation.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Articulation')
for i in [1,2,3]:
    y = articulation.score[articulation.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/artic.png', dpi=300)

articulation.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

Out[69]:

	mean	median	std	len
ageGroup
3	83.422264	84.0	18.680816	1281.0
4	82.868699	85.0	21.087332	1489.0
5	82.219630	86.0	21.043017	1131.0

Expressive Vocabulary¶

In [70]:

# Test type
expressive["test_type"] = None
EOWPVT = expressive.eowpvt_ss.notnull()
EVT = expressive.evt_ss.notnull()
expressive = expressive[EOWPVT | EVT]
expressive.loc[EOWPVT & EVT, "test_type"] = "EOWPVT and EVT"
expressive.loc[EOWPVT & ~EVT, "test_type"] = "EOWPVT"
expressive.loc[~EOWPVT & EVT, "test_type"] = "EVT"
print("There are {0} null values for test_type".format(sum(expressive["test_type"].isnull())))

expressive["score"] = expressive.eowpvt_ss
expressive.loc[~EOWPVT & EVT, "score"] = expressive.evt_ss[~EOWPVT & EVT]

There are 0 null values for test_type

In [71]:

expressive.test_type.value_counts()

Out[71]:

EVT               4271
EOWPVT            3087
EOWPVT and EVT     176
Name: test_type, dtype: int64

Map EVT to EOWPVT¶

In [72]:

# create indicator variable if a student took either test
expressive.loc[expressive.evt_ss.notnull() | expressive.eowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = expressive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.evt_ss.notnull() & temp.eowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()

0    7358
1     176
Name: both, dtype: int64

In [73]:

EVT = temp.evt_ss.notnull()
EOWPVT = temp.eowpvt_ss.notnull()

temp.loc[EVT, "test_name"] = "EVT"
temp.loc[EOWPVT, "test_name"] = "EOWPVT"

In [74]:

# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')

We have 7358 observations where a student took one test in a single year, but only 3333 unique students
We have 176 observations where a student took both test in a single year, but only 127 unique students

In [75]:

del temp

In [76]:

reg = linear_model.LinearRegression()
reg.fit(both.evt_ss.values.reshape(-1,1), both.eowpvt_ss.values)

Out[76]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [77]:

expressive['old_score'] = expressive.score.copy()
pred_vals = reg.predict(expressive[expressive.test_type=='EVT'].score.values.reshape(-1,1))
expressive.loc[expressive.test_type=='EVT', 'score'] = pred_vals

In [78]:

expressive["school"] = expressive.study_id.str.slice(0,4)

In [79]:

expressive["age_test"] = expressive.age_test_eowpvt
expressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]

In [80]:

expressive = expressive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
expressive["domain"] = "Expressive Vocabulary"
expressive.head()

Out[80]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
0	0101-2002-0101	initial_assessment_arm_1	58.0	EOWPVT	0101	54.0	Expressive Vocabulary
2	0101-2002-0101	year_2_complete_71_arm_1	84.0	EOWPVT	0101	80.0	Expressive Vocabulary
5	0101-2002-0101	year_5_complete_71_arm_1	90.0	EOWPVT	0101	113.0	Expressive Vocabulary
14	0101-2004-0101	year_2_complete_71_arm_1	90.0	EOWPVT	0101	53.0	Expressive Vocabulary
15	0101-2004-0101	year_3_complete_71_arm_1	87.0	EOWPVT	0101	66.0	Expressive Vocabulary

In [81]:

expressive['ageGroup'] = None # initial variable to none
expressive.loc[(expressive.age_test >= 36) & (expressive.age_test < 48), 'ageGroup'] = 3 
expressive.loc[(expressive.age_test >= 48) & (expressive.age_test < 60), 'ageGroup'] = 4 
expressive.loc[(expressive.age_test >= 60) & (expressive.age_test < 72), 'ageGroup'] = 5 

bp = expressive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Expressive Vocabulary')
for i in [1,2,3]:
    y = expressive.score[expressive.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/expVocab.png', dpi=300)

expressive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

Out[81]:

	mean	median	std	len
ageGroup
3	90.898049	92.770977	18.628653	1542.0
4	90.023118	91.000000	18.967224	1726.0
5	89.124085	90.000000	17.289723	1283.0

Receptive Vocabulary¶

In [82]:

receptive.columns

Out[82]:

Index(['study_id', 'redcap_event_name', 'age_test_ppvt', 'ppvt_ss',
       'age_test_rowpvt', 'rowpvt_ss'],
      dtype='object')

In [83]:

# Test type
receptive["test_type"] = None
PPVT = receptive.ppvt_ss.notnull()
ROWPVT = receptive.rowpvt_ss.notnull()
receptive = receptive[PPVT | ROWPVT]
receptive.loc[PPVT & ROWPVT, "test_type"] = "PPVT and ROWPVT"
receptive.loc[PPVT & ~ROWPVT, "test_type"] = "PPVT"
receptive.loc[~PPVT & ROWPVT, "test_type"] = "ROWPVT"
print("There are {0} null values for test_type".format(sum(receptive["test_type"].isnull())))

receptive["score"] = receptive.ppvt_ss
receptive.loc[~PPVT & ROWPVT, "score"] = receptive.rowpvt_ss[~PPVT & ROWPVT]

There are 0 null values for test_type

Map PPVT onto ROWPVT¶

In [84]:

# create indicator variable if a student took either test
receptive.loc[receptive.ppvt_ss.notnull() | receptive.rowpvt_ss.notnull(), 'test'] = 1
# drop observations when neither test was taken
temp = receptive.dropna(subset = ['test'])
# Can drop test variable
temp = temp.drop('test', 1)
# Create new variable if student took both tests in one observation
temp['both'] = 0
temp.loc[temp.ppvt_ss.notnull() & temp.rowpvt_ss.notnull(), 'both'] = 1
print(temp.both.value_counts()) # 73 students took both tests, 5716 took only one
# temp.head()

0    7497
1     228
Name: both, dtype: int64

In [85]:

PPVT = temp.ppvt_ss.notnull()
ROWPVT = temp.rowpvt_ss.notnull()

temp.loc[PPVT, "test_name"] = "PPVT"
temp.loc[ROWPVT, "test_name"] = "ROWPVT"

In [86]:

# One test
single = temp.loc[temp.both==0,]
a = single.shape[0]
single = single.groupby('study_id').last()
b = single.shape[0]
print('We have', a, 'observations where a student took one test in a single year, but only', b, 'unique students')

# Both tests
both = temp.loc[temp.both==1,]
a = both.shape[0]
both = both.groupby('study_id').last()
b = both.shape[0]
print('We have', a, 'observations where a student took both test in a single year, but only', b, 'unique students')

We have 7497 observations where a student took one test in a single year, but only 3364 unique students
We have 228 observations where a student took both test in a single year, but only 157 unique students

In [87]:

del temp

In [88]:

rv_reg = linear_model.LinearRegression()
rv_reg.fit(both.ppvt_ss.values.reshape(-1,1), both.rowpvt_ss.values)

Out[88]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [89]:

receptive['old_score'] = receptive.score.copy()
pred_vals = reg.predict(receptive[receptive.test_type=='PPVT'].score.values.reshape(-1,1))
receptive.loc[receptive.test_type=='PPVT', 'score'] = pred_vals

In [90]:

receptive["school"] = receptive.study_id.str.slice(0,4)

In [91]:

receptive["age_test"] = receptive.age_test_ppvt
receptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]

In [92]:

print("There are {0} null values for age_test".format(sum(receptive.age_test.isnull())))

There are 23 null values for age_test

In [93]:

receptive = receptive[["study_id", "redcap_event_name", "score", "test_type", "school", "age_test"]]
receptive["domain"] = "Receptive Vocabulary"
receptive.head()

Out[93]:

	study_id	redcap_event_name	score	test_type	school	age_test	domain
2	0101-2002-0101	year_2_complete_71_arm_1	86.408510	PPVT	0101	80.0	Receptive Vocabulary
5	0101-2002-0101	year_5_complete_71_arm_1	101.000000	ROWPVT	0101	113.0	Receptive Vocabulary
9	0101-2003-0102	initial_assessment_arm_1	58.572717	PPVT	0101	44.0	Receptive Vocabulary
10	0101-2003-0102	year_1_complete_71_arm_1	78.455426	PPVT	0101	54.0	Receptive Vocabulary
11	0101-2003-0102	year_2_complete_71_arm_1	95.156903	PPVT	0101	68.0	Receptive Vocabulary

In [94]:

receptive.study_id.unique().shape

Out[94]:

(3404,)

In [95]:

receptive['ageGroup'] = None # initial variable to none
receptive.loc[(receptive.age_test >= 36) & (receptive.age_test < 48), 'ageGroup'] = 3 
receptive.loc[(receptive.age_test >= 48) & (receptive.age_test < 60), 'ageGroup'] = 4 
receptive.loc[(receptive.age_test >= 60) & (receptive.age_test < 72), 'ageGroup'] = 5 

In [96]:

bp = receptive.boxplot(column='score', by='ageGroup', grid=False, sym='')
plt.xlabel('Age (years)'); plt.ylabel('Standard score');
plt.suptitle('Receptive Vocabulary')
for i in [1,2,3]:
    y = receptive.score[receptive.ageGroup==i+2].dropna()
    # Add some random "jitter" to the x-axis
    x = np.random.normal(i, 0.04, size=len(y))
    plt.plot(x, y.values, 'k.', alpha=0.05)
plt.savefig('DescriptiveFigures/recVocab.png', dpi=300)

receptive.groupby('ageGroup')['score'].agg([np.mean, np.median, np.std, len])

Out[96]:

	mean	median	std	len
ageGroup
3	89.766914	90.385052	16.611238	1603.0
4	88.455310	89.589744	17.323336	1756.0
5	87.784735	88.794435	15.319930	1312.0

In [97]:

width = 3
sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==4].score), bw=width, label = "4-year-olds")
sns.kdeplot(np.array(receptive[receptive.ageGroup ==5].score), bw=width, label = "5-year-olds")

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

Out[97]:

<matplotlib.axes._subplots.AxesSubplot at 0x115c18cc0>

In [98]:

receptive[receptive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
receptive[receptive.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=2)
# plt.legend(bbox_to_anchor=(1.05, 1),loc=2, borderaxespad=0.)
# plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2),
#           ncol=3, fancybox=True, shadow=True)

Out[98]:

<matplotlib.legend.Legend at 0x1190c5320>

Merge Datasets¶

In [99]:

test_scores = pd.concat([articulation, expressive, receptive, language])
test_scores.head()

Out[99]:

	ageGroup	age_test	domain	old_score	redcap_event_name	school	score	study_id	test	test_name	test_type
1	None	80	Articulation	78.0	year_1_complete_71_arm_1	0101	78	0101-2002-0101	1.0	NaN	Goldman
9	3	44	Articulation	72.0	initial_assessment_arm_1	0101	72	0101-2003-0102	1.0	NaN	Goldman
10	4	54	Articulation	97.0	year_1_complete_71_arm_1	0101	97	0101-2003-0102	1.0	NaN	Goldman
14	4	53	Articulation	75.0	year_2_complete_71_arm_1	0101	75	0101-2004-0101	1.0	NaN	Goldman
15	5	66	Articulation	80.0	year_3_complete_71_arm_1	0101	80	0101-2004-0101	1.0	NaN	Goldman

In [100]:

print(test_scores.test_type.value_counts())
print(test_scores.domain.value_counts())

expressive              7712
receptive               7643
Goldman                 5728
PPVT                    4893
EVT                     4271
EOWPVT                  3087
ROWPVT                  2604
Arizonia                 525
PPVT and ROWPVT          228
EOWPVT and EVT           176
Arizonia and Goldman      91
Name: test_type, dtype: int64
Language                 15355
Receptive Vocabulary      7725
Expressive Vocabulary     7534
Articulation              6344
Name: domain, dtype: int64

In [101]:

# make a new domain variable that contains only 5 categories
test_scores['domain2'] = test_scores.domain.copy()
test_scores.loc[test_scores.test_type == 'expressive', 'domain2'] = 'Expressive Language'
test_scores.loc[test_scores.test_type == 'receptive', 'domain2'] = 'Receptive Language'
test_scores.domain2.value_counts()

Out[101]:

Receptive Vocabulary     7725
Expressive Language      7712
Receptive Language       7643
Expressive Vocabulary    7534
Articulation             6344
Name: domain2, dtype: int64

In [102]:

# make a dataframe that contains only 3, 4, and 5-year-olds
test_scores_345 = test_scores.loc[(test_scores.age_test >= 36) & (test_scores.age_test < 72), ]
test_scores_345.domain2.value_counts()

Out[102]:

Receptive Vocabulary     4671
Expressive Vocabulary    4551
Expressive Language      4280
Receptive Language       4264
Articulation             3901
Name: domain2, dtype: int64

In [124]:

# Attempt to make black and white
bp = test_scores_345.boxplot(column='score', by='domain2', grid=False, sym='')
plt.title('Test scores for 3, 4, and 5-year-olds by functional outcome')
plt.suptitle("")
plt.xlabel(''); plt.ylabel('Standard score');
plt.xticks([1, 2, 3, 4, 5], ['Articulation', 'Expressive\nLanguage', 'Expressive\nVocabulary',
                            'Receptive\nLanguage', 'Receptive\nVocabulary'])
# Articulation
y = test_scores_345.score[test_scores_345.domain2=='Articulation'].dropna()
x = np.random.normal(1, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)

# Expressive Language
y = test_scores_345.score[test_scores_345.domain2=='Expressive Language'].dropna()
x = np.random.normal(2, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

# Expressive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Expressive Vocabulary'].dropna()
x = np.random.normal(3, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02)  

# Receptive Language
y = test_scores_345.score[test_scores_345.domain2=='Receptive Language'].dropna()
x = np.random.normal(4, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

# Receptive Vocabulary
y = test_scores_345.score[test_scores_345.domain2=='Receptive Vocabulary'].dropna()
x = np.random.normal(5, 0.08, size=len(y))
plt.plot(x, y.values, 'k.', alpha=0.02) 

plt.axhline(y=85)

# for components in bp.keys():
#    for line in bp[components]:
#        line.set_color('black')     # black lines

Out[124]:

<matplotlib.lines.Line2D at 0x114e37e10>

Histograms¶

Blue = three year olds
Green = four year olds
Red = five year olds

Need to make in black and white.
Need to make legend

Expressive Language¶

In [104]:

sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=False, hist=True, norm_hist=False);

In [105]:

fig = sns.distplot(np.array(receptive[receptive.ageGroup ==3].score), kde=True, hist=False)
plt.yticks(fig.get_yticks(), fig.get_yticks() * 6735)
plt.ylabel('Counts', fontsize=16)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

Out[105]:

<matplotlib.text.Text at 0x11901ce10>

In [106]:

sns.kdeplot(np.array(receptive[receptive.ageGroup ==3].score), bw=width, label = "3-year-olds")
plt.ylabel('Density')
plt.yticks([0, 0.01, 0.02]);
# plt.yticks(fig.get_yticks(), fig.get_yticks() * 100)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [107]:

def kdeplot(domain, width, score='score'):
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==3, score]), bw=width, label = "3-year-olds")
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==4, score]), bw=width, label = "4-year-olds")
    sns.kdeplot(np.array(domain.loc[domain.ageGroup ==5, score]), bw=width, label = "5-year-olds")
    plt.ylabel('Density')

In [108]:

kdeplot(expressive_lang, 4)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [109]:

expressive_lang[expressive_lang.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
expressive_lang[expressive_lang.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
expressive_lang[expressive_lang.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)

Out[109]:

<matplotlib.legend.Legend at 0x113fa9d30>

Receptive Language¶

In [110]:

kdeplot(receptive_lang, 4)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [111]:

receptive_lang[receptive_lang.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive_lang[receptive_lang.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
receptive_lang[receptive_lang.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=1)

Out[111]:

<matplotlib.legend.Legend at 0x115f819e8>

Articulation¶

In [112]:

kdeplot(articulation, 4)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [113]:

articulation[articulation.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
articulation[articulation.ageGroup ==4].score.hist(alpha=0.3, label='4-year-olds')
articulation[articulation.ageGroup ==5].score.hist(alpha=0.3, label='5-year-olds')
plt.legend(loc=2)

Out[113]:

<matplotlib.legend.Legend at 0x11336f358>

Expressive Vocab¶

In [114]:

kdeplot(expressive, 4)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [115]:

expressive[expressive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
expressive[expressive.ageGroup ==4].score.hist(alpha=0.3, label='3-year-olds')
expressive[expressive.ageGroup ==5].score.hist(alpha=0.3, label='3-year-olds')
plt.legend(loc=2)

Out[115]:

<matplotlib.legend.Legend at 0x1186c3898>

Receptive Vocab¶

In [116]:

kdeplot(receptive, 4)

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [117]:

receptive[receptive.ageGroup ==3].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==4].score.hist(alpha=0.3, label='3-year-olds')
receptive[receptive.ageGroup ==5].score.hist(alpha=0.3, label='3-year-olds')
plt.legend(loc=2)

Out[117]:

<matplotlib.legend.Legend at 0x115c3c9b0>

In [118]:

def kdesubplot(domain, width, ax):
    sns.kdeplot(np.array(domain[domain.ageGroup ==3].score), bw=width, label = "3-year-olds", ax=ax)
    sns.kdeplot(np.array(domain[domain.ageGroup ==4].score), bw=width, label = "4-year-olds", ax=ax)
    sns.kdeplot(np.array(domain[domain.ageGroup ==5].score), bw=width, label = "5-year-olds", ax=ax)
    plt.ylabel('Density')

In [122]:

domains = [expressive_lang, receptive_lang, expressive, receptive, articulation]
names = ['Expressive language', 'Receptive language', 'Expressive vocabulary', 'Receptive vocabulary', 'Articulation']

fig, axes = plt.subplots(3,2, figsize=(10,6))
for i,ax in enumerate(axes.ravel()[:-1]):
    kdesubplot(domains[i], 4, ax)
    ax.set_title(names[i])
    ax.set_yticks([])
    if i==2:
        ax.set_ylabel('Density')
    if i>2:
        ax.set_xlabel('Score')
    if i:
        ax.legend_.remove()


axes[-1,-1].axis('off')
plt.tight_layout()
# fig.text(0, 0.5, 'Density', va='center', rotation='vertical')

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [120]:

plt.figure()
plt.subplot(3,2,1)
kdeplot(expressive_lang, 4)
plt.title('Expressive Language')
plt.subplot(3,2,2)
kdeplot(receptive_lang, 4)
plt.title('Receptive Language')
plt.subplot(3,2,3)
kdeplot(expressive, 4)
plt.title('Expressive Vocabulary')
plt.subplot(3,2,4)
kdeplot(receptive, 4)
plt.title('Receptive Vocabulary')
plt.subplot(3,2,5)
kdeplot(articulation, 4)
plt.title('Articulation')

plt.tight_layout()

/Users/fonnescj/anaconda3/envs/dev/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j