from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its deviation from the average of ``arr``,
    divided by the standard deviation of ``arr`` as computed by ``np.std``.
    """
    deviations = arr - np.average(arr)
    return deviations / np.std(arr)
def correlation(t, x, y):
    """Return the correlation coefficient between columns ``x`` and ``y`` of ``t``.

    Both columns are converted to standard units; the result is the average
    of their element-wise products.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)
def slope(t, x, y):
    """Return the slope of the regression line for predicting ``y`` from ``x``.

    Computed as r * (SD of y) / (SD of x), where r is the correlation between
    the two columns of table ``t``.
    """
    x_values, y_values = t.column(x), t.column(y)
    r = correlation(t, x, y)
    return r * np.std(y_values) / np.std(x_values)
def intercept(t, x, y):
    """Return the intercept of the regression line for predicting ``y`` from ``x``.

    Computed as (mean of y) - slope * (mean of x), using columns of table ``t``.
    """
    line_slope = slope(t, x, y)
    return np.mean(t.column(y)) - line_slope * np.mean(t.column(x))
def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values.

    Each estimate is slope * x + intercept, with the line fitted to columns
    ``x`` and ``y`` of table ``t``.
    """
    line_slope, line_intercept = slope(t, x, y), intercept(t, x, y)
    return line_slope * t.column(x) + line_intercept
def residuals(t, x, y):
    """Return the residuals: observed ``y`` values minus their regression estimates."""
    return t.column(y) - fitted_values(t, x, y)
def bootstrap_slope(t, x, y, repetitions=5000):
    """Bootstrap the regression slope and report an approximate 95% interval.

    Resamples the rows of ``t`` ``repetitions`` times, computes the slope of
    the regression line for predicting ``y`` from ``x`` in each resample, then
    draws a histogram of the bootstrapped slopes with the interval marked in
    yellow and prints the observed slope and the interval endpoints.

    Parameters:
        t: table holding the sample.
        x: label of the predictor column.
        y: label of the response column.
        repetitions: number of bootstrap resamples to draw.

    Returns:
        None. The results are plotted and printed.
    """
    # Bootstrap the scatter, find the slope, collect.
    # The slopes are gathered in a list and converted once: np.append copies
    # the whole array on every call, which made the old loop quadratic.
    slopes = np.array([slope(t.sample(), x, y) for _ in np.arange(repetitions)])

    # Find the endpoints of the 95% confidence interval for the true slope
    left = percentile(2.5, slopes)
    right = percentile(97.5, slopes)

    # Slope of the regression line from the original sample
    observed_slope = slope(t, x, y)

    # Display results
    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
    # Mark the interval as a thick yellow segment along the horizontal axis.
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8)
    print('Slope of regression line:', observed_slope)
    print('Approximate 95%-confidence interval for the slope of the true line:')
    print(left, 'to', right)
# Load the baby data set and preview its first five rows.
baby = Table.read_table('baby.csv')
baby.show(5)
# Slope of the regression line for predicting birth weight from maternal age.
slope(baby, 'Maternal Age', 'Birth Weight')
# Scatter plot of the same two columns with the fitted line drawn on top.
baby.scatter('Maternal Age', 'Birth Weight', fit_line=True)
# Bootstrap the slope with 1000 resamples; plots the bootstrapped slopes and
# prints the observed slope and an approximate 95% confidence interval.
bootstrap_slope(baby, 'Maternal Age', 'Birth Weight', 1000)
# ckd = chronic kidney disease
# Class = 1: the patient has ckd
# Class = 0: the patient does not have ckd
# Load the data, shortening the 'Blood Glucose Random' label to 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd.show(3)
# Group the rows by the value of Class.
ckd.group('Class')
# Could you predict whether a patient has ckd?
# Scatter Glucose against two other attributes, coloring each point by Class.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')
# Can you tell whether a banknote is counterfeit or legitimate?
# The variables are based on photographs of many banknotes
# (a few numbers calculated for each image).
banknotes = Table.read_table('banknote.csv')
banknotes
# Scatter pairs of attributes against each other, coloring each point by Class.
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')
banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')
# Two attributes leave some overlap between the classes... what happens with three attributes?
fig = plots.figure(figsize=(8,8))
# Create the 3D axes through the figure so they are attached to it.
# Constructing Axes3D(fig) directly no longer adds the axes to the figure in
# recent matplotlib releases, which leaves the plot blank.
ax = fig.add_subplot(111, projection='3d')
# One point per banknote, positioned by three attributes and colored by Class.
ax.scatter(banknotes.column('WaveletSkew'),
           banknotes.column('WaveletVar'),
           banknotes.column('WaveletCurt'),
           c=banknotes.column('Class'),
           cmap='viridis',
           s=50);