import numpy as np
import pandas as pd
import patsy
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.stats as sm_stats
import itertools
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# use Seaborn's "white" theme (no grid, white background) for all plots below.
sns.set_style("white")
# load the dataset (tab-separated) and keep only complete rows:
sleep = pd.read_csv('../Datasets/sleep.txt', sep='\t').dropna(axis=0)

# --- derived variables ---
# log-transformed body weight:
sleep['log_BodyWt'] = np.log(sleep['BodyWt'])

# binary danger category: Danger values 4 and 5 become "high", the rest "low".
sleep['danger_cat'] = 'low'
heavy_danger = sleep['Danger'] > 3
sleep.loc[heavy_danger, 'danger_cat'] = 'high'

# three roughly equal-sized lifespan bins, renamed to readable labels and
# stored as plain strings rather than a pandas Categorical:
life_bins = pd.qcut(sleep['LifeSpan'], [0, .33, .66, 1])
sleep['life_cat'] = life_bins.cat.rename_categories(['short', 'med', 'long']).astype('object')
sleep.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 42 entries, 1 to 60 Data columns (total 14 columns): Species 42 non-null object BodyWt 42 non-null float64 BrainWt 42 non-null float64 NonDreaming 42 non-null float64 Dreaming 42 non-null float64 TotalSleep 42 non-null float64 LifeSpan 42 non-null float64 Gestation 42 non-null float64 Predation 42 non-null int64 Exposure 42 non-null int64 Danger 42 non-null int64 log_BodyWt 42 non-null float64 danger_cat 42 non-null object life_cat 42 non-null object dtypes: float64(8), int64(3), object(3) memory usage: 4.9+ KB
We can build the simple model TotalSleep ~ log_BodyWt + danger_cat
first, to see how categorical and continuous variables work together in the design matrix.
# build the design matrix combining one continuous and one categorical
# predictor (patsy dummy-codes danger_cat automatically):
simple_design = 'log_BodyWt + danger_cat'
X = patsy.dmatrix(simple_design, sleep)
X
DesignMatrix with shape (42, 3) Intercept danger_cat[T.low] log_BodyWt 1 1 0.00000 1 0 7.84267 1 0 2.35613 1 1 -3.77226 1 0 5.07517 1 1 1.19392 1 1 3.95432 1 0 -0.85567 1 0 6.14204 1 1 -2.59027 1 1 1.09861 1 1 -0.24207 1 1 -1.60944 1 0 3.31999 1 1 -2.12026 1 1 4.44265 1 1 -2.29263 1 0 0.03922 1 0 6.25575 1 0 -5.29832 1 1 -4.60517 1 1 4.12713 1 1 -3.77226 1 1 -3.03655 1 1 0.53063 1 1 1.25276 1 1 -0.73397 1 0 2.30259 1 1 0.48243 1 0 5.25750 [12 rows omitted] Terms: 'Intercept' (column 0) 'danger_cat' (column 1) 'log_BodyWt' (column 2) (to view full data, use np.asarray(this_obj))
# fit the model (the smf.glm default is a Gaussian family with identity
# link -- i.e. ordinary least-squares regression, per the summary output):
simple_formula = 'TotalSleep ~ log_BodyWt + danger_cat'
fit = smf.glm(simple_formula, sleep).fit()
print(fit.summary())
Generalized Linear Model Regression Results ============================================================================== Dep. Variable: TotalSleep No. Observations: 42 Model: GLM Df Residuals: 39 Model Family: Gaussian Df Model: 2 Link Function: identity Scale: 12.9203335198 Method: IRLS Log-Likelihood: -111.77 Date: Tue, 24 Feb 2015 Deviance: 503.89 Time: 17:08:10 Pearson chi2: 504. No. Iterations: 4 ===================================================================================== coef std err z P>|z| [95.0% Conf. Int.] ------------------------------------------------------------------------------------- Intercept 9.3607 1.117 8.378 0.000 7.171 11.551 danger_cat[T.low] 2.8230 1.324 2.132 0.033 0.228 5.418 log_BodyWt -0.7461 0.206 -3.625 0.000 -1.150 -0.343 =====================================================================================
We have three coefficients: the Intercept
($\beta_1$), danger_cat[T.low]
($\beta_2$) and log_BodyWt
($\beta_3$). What do they mean?
If danger_cat
is high
(i.e. the species is in the "high" danger group), then the second beta weight is multiplied by zero. So for the high
group, we just have a line relating log bodyweight and total sleep, whose intercept is 9.36 (i.e. when log bodyweight is zero).
When danger_cat
is low
, the second beta weight of 2.82 is added to the prediction. This has the effect of shifting the whole line relating bodyweight to sleep upwards --- but does not change its slope.
This model is the hypothesis that the only effect of low danger is to raise the average amount the animal sleeps, keeping the relationship between bodyweight and sleep the same.
We visualise these predictions after we fit the more complex model, below.
Adding another categorical predictor (life_cat)

X = patsy.dmatrix('log_BodyWt + danger_cat + life_cat', sleep)
X
DesignMatrix with shape (42, 5) Columns: ['Intercept', 'danger_cat[T.low]', 'life_cat[T.med]', 'life_cat[T.short]', 'log_BodyWt'] Terms: 'Intercept' (column 0) 'danger_cat' (column 1) 'life_cat' (columns 2:4) 'log_BodyWt' (column 4) (to view full data, use np.asarray(this_obj))
# inspect the first ten rows of the design matrix as a raw array:
X[0:10, :]
array([[ 1. , 1. , 0. , 1. , 0. ], [ 1. , 0. , 0. , 0. , 7.84267147], [ 1. , 0. , 0. , 0. , 2.35612586], [ 1. , 1. , 1. , 0. , -3.77226106], [ 1. , 0. , 0. , 0. , 5.07517382], [ 1. , 1. , 0. , 0. , 1.19392247], [ 1. , 1. , 0. , 0. , 3.95431592], [ 1. , 0. , 1. , 0. , -0.85566611], [ 1. , 0. , 0. , 0. , 6.14203741], [ 1. , 1. , 0. , 1. , -2.59026717]])
# fit the extended model with both categorical predictors:
extended_formula = 'TotalSleep ~ log_BodyWt + danger_cat + life_cat'
fit_2 = smf.glm(extended_formula, sleep).fit()
print(fit_2.summary())
Generalized Linear Model Regression Results ============================================================================== Dep. Variable: TotalSleep No. Observations: 42 Model: GLM Df Residuals: 37 Model Family: Gaussian Df Model: 4 Link Function: identity Scale: 13.4347736658 Method: IRLS Log-Likelihood: -111.49 Date: Tue, 24 Feb 2015 Deviance: 497.09 Time: 17:08:10 Pearson chi2: 497. No. Iterations: 4 ===================================================================================== coef std err z P>|z| [95.0% Conf. Int.] ------------------------------------------------------------------------------------- Intercept 10.0980 1.601 6.306 0.000 6.960 13.236 danger_cat[T.low] 2.6883 1.396 1.926 0.054 -0.047 5.424 life_cat[T.med] -1.1116 1.573 -0.707 0.480 -4.195 1.972 life_cat[T.short] -0.6439 1.759 -0.366 0.714 -4.091 2.803 log_BodyWt -0.8236 0.253 -3.254 0.001 -1.320 -0.327 =====================================================================================
What is the situation when all the non-intercept columns of the design matrix equal zero? Answer: when danger_cat
is high
, life_cat
is long
and log_BodyWt
is $0$.
So the intercept is the predicted total amount of sleep, in hours, of species when they are in high danger, live for a relatively long time, and weigh 1 kg (exp(0) = 1
).
Note that no such species necessarily has to exist in the dataset (it doesn't):
# confirm that no observed species matches the intercept's reference
# combination (long lifespan, high danger, log bodyweight exactly 0):
is_reference = ((sleep['life_cat'] == 'long') &
                (sleep['danger_cat'] == 'high') &
                (sleep['log_BodyWt'] == 0.))
sleep['Species'][is_reference]
Series([], name: Species, dtype: object)
When the amount of danger is low, this corresponds to the predicted difference in total sleep from the intercept term, all other things being equal. So if we had a species that had low danger, lived for a long time and weighed 1 kg, it would be expected to sleep for $10.1 + 2.69 \approx 12.8$ hours.
All other things being equal to the intercept case, this is the expected difference in total sleep for when lifespan is "medium" instead of "long".
All other things being equal to the intercept case, this is the expected difference in total sleep for when lifespan is "short" instead of "long".
The expected total sleep duration drops by 0.82 hours for every log unit increase in bodyweight.
Work out the predicted total sleep for a species that is
We can do this manually, by accessing the fit parameters and multiplying them:
# intercept + low danger + medium lifespan + log-bodyweight effect.
# Use .iloc for positional access: plain integer [] indexing on a
# labelled pandas Series is deprecated in modern pandas and can be
# ambiguous when the index contains integers.
coefs = fit_2.params
prediction = (coefs.iloc[0]                    # intercept
              + coefs.iloc[1] * 1              # danger_cat[T.low]
              + coefs.iloc[2] * 1              # life_cat[T.med]
              + coefs.iloc[3] * 0              # life_cat[T.short]
              + coefs.iloc[4] * np.log(13))    # remember we need the log of 13 kg
prediction
9.5621480810353781
So for an animal at low danger, with a medium lifespan and weighing 13 kg, we predict they sleep about 9.6 hours.
The manual multiplication above is awkward and prone to errors in larger design matrices. We can instead use the predict
method, and pass a new design matrix of our own construction:
# one-row frame holding the predictor values we want a prediction for.
# Column names must exactly match those used when fitting the model.
new_dat = pd.DataFrame({'log_BodyWt': [np.log(13)],
                        'danger_cat': ['low'],
                        'life_cat': ['med']})
new_dat
danger_cat | life_cat | log_BodyWt | |
---|---|---|---|
0 | low | med | 2.564949 |
# statsmodels builds the design matrix for new_dat and computes X*beta:
fit_2.predict(new_dat)
array([ 9.56214808])
The prediction ($\mathbf X \beta$) is calculated internally. Note that for this to work, the column names and the values they take on must exactly match the original data --- otherwise the predict method doesn't know what to do.
Unfortunately statsmodels
does not yet have a neat way to visualise predictions from models with continuous and categorical predictors.
To make some nice visualisations we can use Seaborn. Recall that most of Seaborn's cool functionality expects data as a data frame. What happens if we just add predictions to our existing sleep
dataframe?
Visualising predictions (TotalSleep ~ log_BodyWt + danger_cat)

# add the predictions to the dataframe:
# store the simple model's fitted values alongside the raw data:
sleep['yhat'] = fit.predict()
# set up some axes using Seaborn's facetgrid, coloured by danger category:
g = sns.FacetGrid(sleep, hue='danger_cat', size=5)
# use the `map` method to draw predictions (lines) and data (points):
g.map(plt.plot, "log_BodyWt", "yhat")
g.map(plt.scatter, "log_BodyWt", "TotalSleep")
sns.despine(offset=10)
# NOTE: the original cell had a bare `g.set_titles` here -- an attribute
# access with no call, which does nothing. Removed as a no-op.
g.add_legend();
Visualising predictions (TotalSleep ~ log_BodyWt + danger_cat + life_cat)

# add the predictions to the dataframe:
# store the extended model's fitted values alongside the raw data:
sleep['yhat_2'] = fit_2.predict()
# facet columns on lifespan category, colour on danger category:
g = sns.FacetGrid(sleep, hue='danger_cat', col="life_cat", size=3)
# use the `map` method to draw predictions (lines) and data (points):
g.map(plt.plot, "log_BodyWt", "yhat_2")
g.map(plt.scatter, "log_BodyWt", "TotalSleep")
sns.despine(offset=5)
# NOTE: the original cell had a bare `g.set_titles` here -- an attribute
# access with no call, which does nothing. Removed as a no-op.
g.add_legend();
Here we can see this plotting start to break down. Why are the model prediction lines all different lengths? Why is there no "high" danger line in the "short" lifespan group?
The reason is that the predictions only extend as far as our data does. There's no blue line in the last facet because there's only one "high" danger animal in the "short" lifespan category -- and hence no second x value to draw a line to.
To make the model predictions consistent, we can create a new dataframe that contains all combinations of the variables we need. To do this we can use a function, given below, called expand_grid
.
# define expand grid function:
def expand_grid(data_dict):
    """ A port of R's expand.grid function for use with Pandas dataframes.

    Builds one row for every combination of the value sequences in
    `data_dict`, with one column per dictionary key (in insertion order).
    from http://pandas.pydata.org/pandas-docs/stable/cookbook.html?highlight=expand%20grid
    """
    column_names = list(data_dict.keys())
    value_lists = [data_dict[name] for name in column_names]
    # Cartesian product of the value lists gives one record per row:
    combinations = itertools.product(*value_lists)
    return pd.DataFrame.from_records(combinations, columns=column_names)
# build a prediction grid with expand_grid. Two x values suffice because
# each model prediction is a straight line:
bodywt_endpoints = np.linspace(-6, 9, 2)
predict_frame = expand_grid(
    {'log_BodyWt': bodywt_endpoints,
     'life_cat': ['short', 'med', 'long'],
     'danger_cat': ['low', 'high']})
predict_frame
log_BodyWt | life_cat | danger_cat | |
---|---|---|---|
0 | -6 | short | low |
1 | -6 | short | high |
2 | -6 | med | low |
3 | -6 | med | high |
4 | -6 | long | low |
5 | -6 | long | high |
6 | 9 | short | low |
7 | 9 | short | high |
8 | 9 | med | low |
9 | 9 | med | high |
10 | 9 | long | low |
11 | 9 | long | high |
So in predict_frame
, we have a number of rows containing all combinations of the values we need. Note here we have only two values for the x-axis (log_BodyWt) that we want to plot. This is sufficient here because our model is linear -- you can draw a line between two points. If we had a nonlinear response (e.g., a logistic link function) we would need to make a bigger vector (using np.linspace
with more than 2 points) to show the curve of our predictions appropriately.
Now add our model predictions:
# evaluate both fitted models on every row of the prediction grid:
for column_name, model in (('yhat', fit), ('yhat_2', fit_2)):
    predict_frame[column_name] = model.predict(predict_frame)
predict_frame
log_BodyWt | life_cat | danger_cat | yhat | yhat_2 | |
---|---|---|---|---|---|
0 | -6 | short | low | 16.660219 | 17.084153 |
1 | -6 | short | high | 13.837226 | 14.395840 |
2 | -6 | med | low | 16.660219 | 16.616439 |
3 | -6 | med | high | 13.837226 | 13.928126 |
4 | -6 | long | low | 16.660219 | 17.728076 |
5 | -6 | long | high | 13.837226 | 15.039763 |
6 | 9 | short | low | 5.468932 | 4.729805 |
7 | 9 | short | high | 2.645938 | 2.041492 |
8 | 9 | med | low | 5.468932 | 4.262091 |
9 | 9 | med | high | 2.645938 | 1.573778 |
10 | 9 | long | low | 5.468932 | 5.373728 |
11 | 9 | long | high | 2.645938 | 2.685415 |
Since Seaborn plotting expects one, and only one, data frame, in this case we can just append the predictions dataframe onto the sleep dataframe (i.e. we tack it on the end, after the last row in sleep
). Any columns not present in predict_frame
are filled with NaNs
.
# munge data together: stack predict_frame under sleep, so predictions and
# raw data live in one frame. Columns absent from predict_frame are NaN.
# (DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append's default, keeps each frame's own index.)
merged_df = pd.concat([sleep, predict_frame])
merged_df.tail()
BodyWt | BrainWt | Danger | Dreaming | Exposure | Gestation | LifeSpan | NonDreaming | Predation | Species | TotalSleep | danger_cat | life_cat | log_BodyWt | yhat | yhat_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | high | short | 9 | 2.645938 | 2.041492 |
8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | low | med | 9 | 5.468932 | 4.262091 |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | high | med | 9 | 2.645938 | 1.573778 |
10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | low | long | 9 | 5.468932 | 5.373728 |
11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | high | long | 9 | 2.645938 | 2.685415 |
# redo the first plot using the merged frame, so the prediction lines span
# the full x-range supplied in predict_frame:
g = sns.FacetGrid(merged_df, hue='danger_cat', size=5)
# draw predictions as lines and the raw data as points:
for draw, y_column in ((plt.plot, "yhat"), (plt.scatter, "TotalSleep")):
    g.map(draw, "log_BodyWt", y_column)
sns.despine(offset=10)
g.set_xlabels('Bodyweight [log kg]')
g.set_ylabels('Sleep [hours]')
g.add_legend(title='Danger');
# redo the faceted plot with the merged frame (columns = lifespan bins):
g = sns.FacetGrid(merged_df, hue='danger_cat', col='life_cat', size=3)
# draw predictions as lines and the raw data as points:
for draw, y_column in ((plt.plot, "yhat_2"), (plt.scatter, "TotalSleep")):
    g.map(draw, "log_BodyWt", y_column)
sns.despine(offset=10)
g.fig.subplots_adjust(wspace=0.3)  # widen the gap between facet columns
g.set_xlabels('Bodyweight [log kg]')
g.set_ylabels('Sleep [hours]')
g.add_legend(title='Danger');
So now we have some nice plots, showing how the categorical variables (danger and lifespan) change the model predictions. Note how the slope of the relationship between bodyweight and sleep is the same in all panels -- that's by design in the model. Adding in the categorical variables only changes the intercepts, not the slope. Changing the slopes requires having an interaction term... which we will talk about next week.
from IPython.core.display import HTML

def css_styling():
    """Read the notebook's custom stylesheet and return it for display.

    Returns an IPython HTML object; displaying it applies the CSS.
    """
    # 'with' guarantees the file handle is closed (the original left it open).
    with open("../custom_style.css", "r") as style_file:
        styles = style_file.read()
    return HTML(styles)

css_styling()