# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
galton = Table.read_table('galton.csv')
heights = Table().with_columns(
'MidParent', galton.column('midparentHeight'),
'Child', galton.column('childHeight')
)
heights
def standard_units(arr):
return (arr - np.average(arr))/np.std(arr)
def correlation(t, x, y):
x_standard = standard_units(t.column(x))
y_standard = standard_units(t.column(y))
return np.average(x_standard * y_standard)
def slope(t, x, y):
r = correlation(t, x, y)
y_sd = np.std(t.column(y))
x_sd = np.std(t.column(x))
return r * y_sd / x_sd
def intercept(t, x, y):
x_mean = np.mean(t.column(x))
y_mean = np.mean(t.column(y))
return y_mean - slope(t, x, y)*x_mean
def fitted_values(t, x, y):
"""Return an array of the regression estimates at all the x values"""
a = slope(t, x, y)
b = intercept(t, x, y)
return a*t.column(x) + b
def residuals(t, x, y):
predictions = fitted_values(t, x, y)
return t.column(y) - predictions
heights = heights.with_columns(
'Fitted Value', fitted_values(heights, 'MidParent', 'Child'),
'Residual', residuals(heights, 'MidParent', 'Child')
)
heights
correlation(heights, 'MidParent', 'Child')
heights.scatter('MidParent')
def plot_residuals(t, x, y):
tbl = t.with_columns(
'Fitted', fitted_values(t, x, y),
'Residual', residuals(t, x, y)
)
tbl.select(x, y, 'Fitted').scatter(0)
tbl.scatter(x, 'Residual')
plot_residuals(heights, 'MidParent', 'Child')
# Length in meters
# Age in years
# Ages are estimated based on variables (e.g. condition of teeth)
dugong = Table.read_table('dugong.csv')
dugong.show(5)
dugong.scatter('Length', 'Age')
correlation(dugong, 'Length', 'Age')
plot_residuals(dugong, 'Length', 'Age')
# Height and average weight of US women
us_women = Table.read_table('us_women.csv')
us_women.show(5)
correlation(us_women, 'height', 'ave weight')
plot_residuals(us_women, 'height', 'ave weight')
demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)
correlation(demographics, 'Median Income', 'Percent voting for Clinton')
plot_residuals(demographics, 'Median Income', 'Percent voting for Clinton')
movies = Table.read_table('actors.csv')
movies.show(3)
plot_residuals(movies, 'Number of Movies', 'Average per Movie')
movies.sort("Average per Movie", descending = True)
# Nonlinear
round(np.average(residuals(dugong, 'Length', 'Age')), 6)
# Linear
round(np.average(residuals(heights, 'MidParent', 'Child')), 6)
# Heteroscedasticity ("uneven spread")
round(np.average(residuals(demographics, 'Median Income', 'Percent voting for Clinton')), 6)
def plot_fitted(t, x, y):
tbl = t.select(x, y)
tbl.with_columns('Fitted Value', fitted_values(t, x, y)).scatter(0)
plot_fitted(heights, 'MidParent', 'Child')
child_predictions_sd = np.std(fitted_values(heights, 'MidParent', 'Child'))
child_observed_sd = np.std(heights.column('Child'))
print(child_predictions_sd)
print(child_observed_sd)
child_predictions_sd / child_observed_sd
correlation(heights, 'MidParent', 'Child')
correlation(dugong, 'Length', 'Age')
dugong_prediction_sd = np.std(fitted_values(dugong, 'Length', 'Age'))
dugong_observed_sd = np.std(dugong.column(1))
dugong_prediction_sd / dugong_observed_sd
hybrid = Table.read_table('hybrid.csv')
hybrid.show(5)
plot_residuals(hybrid, 'acceleration', 'mpg')
correlation(hybrid, 'acceleration', 'mpg')
np.std(fitted_values(hybrid, 'acceleration', 'mpg'))/np.std(hybrid.column('mpg'))
No matter what the shape of the scatter plot, the SD of the fitted values is a fraction of the SD of the observed values of $y$. The fraction is |r|.
$$ \frac{\mbox{SD of fitted values}}{\mbox{SD of }y} ~=~ |r| ~~~~~~~~~~ \mbox{That is,} ~~ \mbox{SD of fitted values} = |r|\cdot \mbox{SD of }y $$No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. The fraction is $\sqrt{1-r^2}$.
$$ \mbox{SD of residuals} ~=~ \sqrt{1 - r^2} \cdot \mbox{SD of }y $$plot_fitted(heights, 'MidParent', 'Child')
plot_fitted(heights, 'MidParent', 'Child')
ave_child = np.mean(heights.column('Child'))
plots.plot([64, 76], [ave_child, ave_child]);
np.std(heights.column('Child')) ** 2
np.std(residuals(heights, 'MidParent', 'Child')) ** 2
np.std(heights.column('Fitted Value')) ** 2
np.std(residuals(heights, 'MidParent', 'Child')) ** 2 + np.std(heights.column('Fitted Value')) ** 2
The above comes from the variance decomposition:
$$
\frac{\mbox{Variance of residuals}}{\mbox{Variance of }y} ~+~ \frac{\mbox{Variance of fitted values}}{\mbox{Variance of }y} = r^2 + (1-r^2) = 1,
$$
which is leads to:
$$
\mbox{Variance of residuals} ~+~ \mbox{Variance of fitted values} = \mbox{Variance of }y
$$
np.std(dugong.column('Age')) ** 2
np.std(fitted_values(dugong, 'Length', 'Age')) ** 2
np.std(residuals(dugong, 'Length', 'Age')) ** 2
np.std(fitted_values(dugong, 'Length', 'Age')) ** 2 + np.std(residuals(dugong, 'Length', 'Age')) ** 2
r = correlation(heights, 'MidParent', 'Child')
r
np.sqrt(1 - r**2) * np.std(heights.column('Child'))
np.std(residuals(heights, 'MidParent', 'Child'))
np.std(residuals(hybrid, 'acceleration', 'mpg'))
r = correlation(hybrid, 'acceleration', 'mpg')
r
np.sqrt(1 - r**2)*np.std(hybrid.column('mpg'))