#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# HIDDEN
from datascience import *
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')


# ## Lecture 30

# In[ ]:


def r_table(r, num_points=1000):
    """
    Generate a table of num_points (x, y) pairs with correlation approximately r.

    x and z are independent standard-normal samples, and
    y = r*x + sqrt(1 - r**2)*z, which gives y a correlation of about r with x.

    NOTE: the global NumPy random state is re-seeded with 8 on every call,
    so repeated calls with the same arguments produce the same data.

    Parameters:
        r: target correlation; must lie in [-1, 1].
        num_points: number of rows to generate (default 1000).

    Returns:
        A Table with columns 'x' and 'y'.

    Raises:
        ValueError: if r is outside [-1, 1] (including NaN); otherwise
            sqrt(1 - r**2) would silently fill the 'y' column with NaN.
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not -1 <= r <= 1:
        raise ValueError('r must be between -1 and 1, got {!r}'.format(r))
    np.random.seed(8)
    x = np.random.normal(0, 1, num_points)
    z = np.random.normal(0, 1, num_points)
    y = r*x + (np.sqrt(1-r**2))*z
    return Table().with_columns('x', x, 'y', y)


# In[ ]:


def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):
    """
    Plot the straight line y = x*slope + intercept on the current axes.

    The line is evaluated at the values in x (by default the two endpoints
    -4 and 4) and drawn in the given matplotlib color.
    """
    rise = x * slope
    plots.plot(x, rise + intercept, color=color)


# In[ ]:


def draw_vertical_line(x_position, color='black'):
    """
    Plot a vertical segment at x = x_position running from y = -4 to y = 4.
    """
    plots.plot(
        make_array(x_position, x_position),  # same x at both endpoints
        make_array(-4, 4),
        color=color,
    )


# In[ ]:


def resize_window(lim=3.5):
    """
    Set both the x-axis and y-axis limits of the current plot to [-lim, lim].
    """
    bounds = (-lim, lim)
    plots.xlim(*bounds)
    plots.ylim(*bounds)


# In[ ]:


# Ecological correlations: read sat2014.csv into a table and sort its rows
# by the 'State' column.
# NOTE(review): presumably one row per state (aggregate scores) -- confirm
# against the CSV.
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014  # bare expression: displays the table when run as a notebook cell


# In[ ]:


# Scatter plot with 'Critical Reading' on the horizontal axis and 'Math' on
# the vertical axis.
sat2014.scatter('Critical Reading', 'Math')


# In[ ]:


# Example dataset with high correlation (r_table is asked for r = 0.99).
example = r_table(0.99)
example  # bare expression: displays the table when run as a notebook cell


# In[ ]:


# Scatter plot of 'y' against 'x'; the call that would fix the axis limits
# is left disabled below.
example.scatter('x', 'y')
# resize_window()


# In[ ]:


#Function to predict values of y for a given x
# Function to predict values of y for a given x
def predict_y(x_val):
    """
    Predict y at x_val as the mean 'y' of nearby rows of the example table.

    "Nearby" means rows of the module-level `example` table whose 'x' value
    satisfies are.between(x_val - 0.25, x_val + 0.25). Because `example` is
    looked up at call time, the prediction follows whatever table that name
    currently refers to.
    """
    window = are.between(x_val-0.25, x_val + 0.25)
    neighbor_ys = example.where('x', window).column('y')
    return np.mean(neighbor_ys)


# In[ ]:


# Add a 'Predicted y' column: predict_y is applied to every value in the
# 'x' column of the current example table.
example = example.with_column('Predicted y', example.apply(predict_y, 'x'))


# In[ ]:


# Visualize predicted values: with only the x column named, scatter should
# plot the remaining columns ('y' and 'Predicted y') against 'x'.
example.scatter('x')
resize_window()


# In[ ]:


# The blue line (slope 1 through the origin) matches our predicted points.
example.scatter('x')
draw_line(slope=1, intercept=0, color='dodgerblue')
resize_window()


# In[ ]:


# Example with correlation of 0. Rebinding `example` also changes which
# table predict_y reads, since predict_y looks up the global name.
example = r_table(0)
example.scatter('x', 'y')
resize_window()


# In[ ]:


# Nearest-neighbor predictions for the r = 0 table produce an approximately
# horizontal line.
example = example.with_column('Predicted y', example.apply(predict_y, 'x'))
example.scatter('x')
resize_window()


# In[ ]:


# Example with correlation of 0.5.
# Question for the audience: which of the two candidate lines follows the
# pattern of the points?
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()
# Vertical marker at x = 1.5, then two candidate lines through the origin:
# slope 1 in draw_line's default color 'r', and slope 0.2 in dodgerblue.
draw_vertical_line(1.5)
draw_line(slope=1, intercept=0)
draw_line(slope=0.2, intercept=0, color='dodgerblue')


# In[ ]:


# Nearest-neighbor predictions for the r = 0.5 table (predict_y reads the
# rebound global `example`).
example = example.with_column('Predicted y', example.apply(predict_y, 'x'))
example.scatter('x')
resize_window()


# In[ ]:


# The blue line (slope 0.5) matches our predictions well; the red line
# (slope 1) is drawn for comparison.
example.scatter('x')
draw_line(slope=1, intercept=0, color='red')
draw_line(slope=0.5, intercept=0, color='dodgerblue')
resize_window()


# In[ ]:


################################################
def standard_units(arr):
    """
    Convert arr to standard units: subtract its average, then divide by its
    standard deviation (np.std, i.e. the population SD).
    """
    centered = arr - np.average(arr)
    return centered / np.std(arr)

def correlation(t, x, y):
    """
    Return the correlation between columns x and y of table t.

    Both columns are converted to standard units; the result is the average
    of the element-wise products.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)

def slope(t, x, y):
    """
    Return the slope of the regression line for predicting column y from
    column x of table t: correlation * SD(y) / SD(x).
    """
    spread_y = np.std(t.column(y))
    spread_x = np.std(t.column(x))
    return correlation(t, x, y) * spread_y / spread_x


def intercept(t, x, y):
    """
    Return the intercept of the regression line for predicting column y from
    column x of table t: mean(y) - slope * mean(x).
    """
    x_values, y_values = t.column(x), t.column(y)
    slope_times_mean_x = slope(t, x, y)*np.mean(x_values)
    return np.mean(y_values) - slope_times_mean_x


# In[ ]:


# Load the Galton data from galton.csv.
galton = Table.read_table('galton.csv')

# Keep only the two columns used below, under shorter labels:
# 'midparentHeight' -> 'MidParent' and 'childHeight' -> 'Child'.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight'))
heights  # bare expression: displays the table when run as a notebook cell


# In[ ]:


def predict_child(h, half_width=0.5):
    """Return a prediction of the height of a child
    whose parents have a midparent height of h.

    The prediction is the average height of the children
    whose midparent height is in the range h plus or minus half_width
    inches (0.5 inches by default).

    The rows are taken from the module-level `heights` table and selected
    with are.between(h - half_width, h + half_width); per the datascience
    documentation that predicate includes the lower bound and excludes the
    upper bound.

    Parameters:
        h: midparent height to predict for.
        half_width: half the width of the neighborhood around h.

    Returns:
        The mean of the 'Child' column over the selected rows.
    """
    close_points = heights.where(
        'MidParent', are.between(h - half_width, h + half_width))
    return close_points.column('Child').mean()


# In[ ]:


# Nearest-neighbor prediction for every row: predict_child is applied to
# each value in the 'MidParent' column.
heights_with_predictions = heights.with_column(
    'Average neighbor prediction', heights.apply(predict_child, 'MidParent'))


# In[ ]:


# Slope and intercept of the regression line for predicting 'Child' from
# 'MidParent'; the bare tuple displays both values in a notebook.
galton_slope = slope(heights, 'MidParent', 'Child')
galton_intercept = intercept(heights, 'MidParent', 'Child')
galton_slope, galton_intercept


# In[ ]:


# Look at a single row (index 123) of the heights table.
heights.take(123)


# In[ ]:


# Regression prediction for that row.
# NOTE(review): indexing the one-row table with [0] presumably selects its
# first column ('MidParent') -- confirm against the datascience Table API.
galton_slope*heights.take(123)[0] + galton_intercept


# In[ ]:


# Rows whose 'MidParent' value equals 69.48, shown with their
# nearest-neighbor predictions.
# NOTE(review): 69.48 is presumably the midparent height of row 123 above.
heights_with_predictions.where('MidParent', are.equal_to(69.48))


# In[ ]:


# Add the regression-line prediction for every row. The 'MidParent' values
# are read from `heights`, the table heights_with_predictions was built
# from by adding columns.
heights_with_predictions = heights_with_predictions.with_column(
    'Regression Prediction', galton_slope*heights.column('MidParent') + galton_intercept
)
heights_with_predictions


# In[ ]:


# With only the x column named, scatter should plot the remaining columns
# ('Child' and both prediction columns) against 'MidParent'.
heights_with_predictions.scatter('MidParent')


# In[ ]:


# Load the demographics table from district_demographics2016.csv.
demographics = Table.read_table('district_demographics2016.csv')
demographics  # bare expression: displays the table when run as a notebook cell


# In[ ]:


# Scatter plot with 'College%' on the horizontal axis and 'Median Income'
# on the vertical axis.
demographics.scatter('College%', 'Median Income')


# In[ ]:


# Slope and intercept of the regression line for predicting 'Median Income'
# from 'College%'; the bare tuple displays both values in a notebook.
demographics_slope = slope(demographics, 'College%', 'Median Income')
demographics_intercept = intercept(demographics, 'College%', 'Median Income')
(demographics_slope, demographics_intercept)


# In[ ]:


# Same scatter plot with the regression line overlaid. The line is drawn
# between x = 0 and x = 80 instead of draw_line's default endpoints of
# -4 and 4.
demographics.scatter('College%', 'Median Income')
draw_line(slope=demographics_slope, intercept=demographics_intercept, x=make_array(0, 80))


# In[ ]: