In [ ]:

# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 32¶

In [ ]:

# Load the table stored in galton.csv.
galton = Table.read_table('galton.csv')

# Build a two-column table for the regression: 'MidParent' and 'Child' are
# copies of galton's 'midparentHeight' and 'childHeight' columns.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )

In [ ]:

heights

In [ ]:

def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is replaced by its deviation from the array's average,
    divided by the array's standard deviation (np.std with its default
    settings).
    """
    deviations = arr - np.average(arr)
    return deviations / np.std(arr)

def correlation(t, x, y):
    """Return the correlation coefficient between columns x and y of table t.

    Both columns are converted to standard units; the result is the average
    of the element-wise products of the two standardized arrays.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    The slope is r * (SD of y) / (SD of x), where r is the correlation
    between columns x and y of table t.
    """
    sd_of_y = np.std(t.column(y))
    sd_of_x = np.std(t.column(x))
    return correlation(t, x, y) * sd_of_y / sd_of_x

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    Computed as (mean of y) - slope * (mean of x), using columns x and y
    of table t.
    """
    line_slope = slope(t, x, y)
    return np.mean(t.column(y)) - line_slope * np.mean(t.column(x))

def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values.

    The estimate for each row is slope * x + intercept, with the slope and
    intercept taken from the regression of column y on column x of table t.
    """
    line_slope, line_intercept = slope(t, x, y), intercept(t, x, y)
    x_values = t.column(x)
    return line_slope * x_values + line_intercept

def residuals(t, x, y):
    """Return an array of residuals: observed y minus the regression estimate.

    The estimates come from fitted_values(t, x, y), so there is one
    residual per row of table t.
    """
    observed = t.column(y)
    return observed - fitted_values(t, x, y)

In [ ]:

# Extend heights with two columns computed from the regression of 'Child' on
# 'MidParent': the fitted value for each row and the residual
# (observed 'Child' minus fitted value). The last line displays the table.
heights = heights.with_columns(
    'Fitted Value', fitted_values(heights, 'MidParent', 'Child'),
    'Residual', residuals(heights, 'MidParent', 'Child')
)
heights

In [ ]:

correlation(heights, 'MidParent', 'Child')

In [ ]:

# NOTE(review): heights now also holds 'Fitted Value' and 'Residual', so this
# appears to overlay all three remaining columns against 'MidParent' on one
# set of axes -- confirm that including the residuals here is intended.
heights.scatter('MidParent')

In [ ]:

def plot_residuals(t, x, y):
    """Draw the regression diagnostics for predicting y from x in table t.

    Two scatter plots are produced:
      1. columns y and 'Fitted' plotted against x;
      2. the residuals plotted against x.
    The table t itself is not modified; the extra columns live on a copy.
    """
    fitted = fitted_values(t, x, y)
    resid = residuals(t, x, y)
    augmented = t.with_columns('Fitted', fitted, 'Residual', resid)
    # First column of the selection (x) is used as the horizontal axis.
    augmented.select(x, y, 'Fitted').scatter(0)
    augmented.scatter(x, 'Residual')

In [ ]:

plot_residuals(heights, 'MidParent', 'Child')

Diagnostics with Residuals¶

In [ ]:

# Length in meters
# Age in years
# Ages are estimated based on variables (e.g. condition of teeth)
dugong = Table.read_table('dugong.csv')
dugong.show(5)

In [ ]:

dugong.scatter('Length', 'Age')

In [ ]:

correlation(dugong, 'Length', 'Age')

In [ ]:

plot_residuals(dugong, 'Length', 'Age')

In [ ]:

# Height and average weight of US women
us_women = Table.read_table('us_women.csv')
us_women.show(5)

In [ ]:

correlation(us_women, 'height', 'ave weight')

In [ ]:

plot_residuals(us_women, 'height', 'ave weight')

In [ ]:

demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)

In [ ]:

correlation(demographics, 'Median Income', 'Percent voting for Clinton')

In [ ]:

plot_residuals(demographics, 'Median Income', 'Percent voting for Clinton')

In [ ]:

movies = Table.read_table('actors.csv')
movies.show(3)

In [ ]:

plot_residuals(movies, 'Number of Movies', 'Average per Movie')

In [ ]:

movies.sort("Average per Movie", descending = True)

Average of Residuals¶

In [ ]:

# Nonlinear
round(np.average(residuals(dugong, 'Length', 'Age')), 6)

In [ ]:

# Linear
round(np.average(residuals(heights, 'MidParent', 'Child')), 6)

In [ ]:

# Heteroscedasticity ("uneven spread")
round(np.average(residuals(demographics, 'Median Income', 'Percent voting for Clinton')), 6)

A Measure of Clustering¶

In [ ]:

def plot_fitted(t, x, y):
    """Scatter y and its regression estimates against x for table t.

    Only columns x and y are kept, a 'Fitted Value' column is added, and
    everything is plotted with x (the first column) on the horizontal axis.
    """
    xy_only = t.select(x, y)
    fitted = fitted_values(t, x, y)
    xy_only.with_columns('Fitted Value', fitted).scatter(0)

In [ ]:

plot_fitted(heights, 'MidParent', 'Child')

In [ ]:

# Compare the spread of the fitted child heights with the spread of the
# observed child heights; both SDs are printed so they can be compared.
child_predictions_sd = np.std(fitted_values(heights, 'MidParent', 'Child'))
child_observed_sd = np.std(heights.column('Child'))
print(child_predictions_sd)
print(child_observed_sd)

In [ ]:

child_predictions_sd / child_observed_sd

In [ ]:

correlation(heights, 'MidParent', 'Child')

In [ ]:

correlation(dugong, 'Length', 'Age')

In [ ]:

# Ratio of the SD of the fitted ages to the SD of the observed ages.
# The observed column is selected by its label ('Age', the y variable of the
# regression) rather than by position, so the ratio does not depend on the
# column order of dugong.csv.
dugong_prediction_sd = np.std(fitted_values(dugong, 'Length', 'Age'))
dugong_observed_sd = np.std(dugong.column('Age'))
dugong_prediction_sd / dugong_observed_sd

In [ ]:

hybrid = Table.read_table('hybrid.csv')
hybrid.show(5)

In [ ]:

plot_residuals(hybrid, 'acceleration', 'mpg')

In [ ]:

correlation(hybrid, 'acceleration', 'mpg')

In [ ]:

np.std(fitted_values(hybrid, 'acceleration', 'mpg'))/np.std(hybrid.column('mpg'))

No matter what the shape of the scatter plot, the SD of the fitted values is a fraction of the SD of the observed values of $y$. The fraction is $|r|$.

$$ \frac{\mbox{SD of fitted values}}{\mbox{SD of }y} ~=~ |r| ~~~~~~~~~~ \mbox{That is,} ~~ \mbox{SD of fitted values} = |r|\cdot \mbox{SD of }y $$

SD of the Residuals¶

No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. The fraction is $\sqrt{1-r^2}$.

$$ \mbox{SD of residuals} ~=~ \sqrt{1 - r^2} \cdot \mbox{SD of }y $$

In [ ]:

plot_fitted(heights, 'MidParent', 'Child')

In [ ]:

# Redraw the fitted-value plot, then overlay a horizontal line at the
# average observed child height.
plot_fitted(heights, 'MidParent', 'Child')
ave_child = np.mean(heights.column('Child'))
# The line runs from x = 64 to x = 76 at height ave_child; the trailing
# semicolon suppresses the notebook's echo of the plot object.
plots.plot([64, 76], [ave_child, ave_child]);

In [ ]:

np.std(heights.column('Child')) ** 2

In [ ]:

np.std(residuals(heights, 'MidParent', 'Child')) ** 2

In [ ]:

np.std(heights.column('Fitted Value')) ** 2

In [ ]:

np.std(residuals(heights, 'MidParent', 'Child')) ** 2 + np.std(heights.column('Fitted Value')) ** 2

The above comes from the variance decomposition: $$ \frac{\mbox{Variance of residuals}}{\mbox{Variance of }y} ~+~ \frac{\mbox{Variance of fitted values}}{\mbox{Variance of }y} = (1-r^2) + r^2 = 1, $$
which leads to: $$ \mbox{Variance of residuals} ~+~ \mbox{Variance of fitted values} = \mbox{Variance of }y $$

In [ ]:

np.std(dugong.column('Age')) ** 2

In [ ]:

np.std(fitted_values(dugong, 'Length', 'Age')) ** 2

In [ ]:

np.std(residuals(dugong, 'Length', 'Age')) ** 2

In [ ]:

np.std(fitted_values(dugong, 'Length', 'Age')) ** 2 + np.std(residuals(dugong, 'Length', 'Age')) ** 2

In [ ]:

r = correlation(heights, 'MidParent', 'Child')
r

In [ ]:

np.sqrt(1 - r**2) * np.std(heights.column('Child'))

In [ ]:

np.std(residuals(heights, 'MidParent', 'Child'))

In [ ]:

np.std(residuals(hybrid, 'acceleration', 'mpg'))

In [ ]:

r = correlation(hybrid, 'acceleration', 'mpg')
r

In [ ]:

np.sqrt(1 - r**2)*np.std(hybrid.column('mpg'))

In [ ]: