#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# HIDDEN
from datascience import *
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')


# ## Lecture 30

# In[ ]:


def r_table(r, num_points=1000):
    """Generate a table of points whose correlation is approximately r.

    Args:
        r: Target correlation coefficient; must lie in [-1, 1].
        num_points: Number of (x, y) rows to generate.

    Returns:
        A Table with columns 'x' and 'y'.

    Raises:
        ValueError: If r is outside [-1, 1] (the square root below would
            otherwise silently yield NaN for every y).

    Note:
        The global NumPy random seed is reset to 8 on every call, so the
        same x and z draws are reused regardless of r.
    """
    if np.any(np.abs(r) > 1):
        raise ValueError('r must be between -1 and 1')
    np.random.seed(8)
    x = np.random.normal(0, 1, num_points)
    z = np.random.normal(0, 1, num_points)
    # Mixing x with independent noise z in these proportions gives y unit
    # variance and correlation r with x (in expectation).
    y = r * x + np.sqrt(1 - r ** 2) * z
    return Table().with_columns('x', x, 'y', y)


# In[ ]:


def draw_line(slope=0, intercept=0, x=None, color='r'):
    """Plot the line y = slope * x + intercept on the current axes.

    Args:
        slope: Slope of the line.
        intercept: Value of y at x = 0.
        x: Array of x-coordinates to draw the line through; defaults to
            the two endpoints -4 and 4.
        color: Matplotlib color of the line.
    """
    # Build the default per call instead of sharing one array object
    # created at definition time.
    if x is None:
        x = make_array(-4, 4)
    y = x * slope + intercept
    plots.plot(x, y, color=color)


# In[ ]:


def draw_vertical_line(x_position, color='black'):
    """Plot a vertical line at x_position spanning y = -4 to y = 4."""
    x = make_array(x_position, x_position)
    y = make_array(-4, 4)
    plots.plot(x, y, color=color)


# In[ ]:


def resize_window(lim=3.5):
    """Set both axes of the current plot to the range [-lim, lim]."""
    plots.xlim(-lim, lim)
    plots.ylim(-lim, lim)


# In[ ]:


# Ecological correlations
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014


# In[ ]:


sat2014.scatter('Critical Reading', 'Math')


# In[ ]:


# Example dataset with high correlation
example = r_table(0.99)
example


# In[ ]:


example.scatter('x', 'y')
# resize_window()


# In[ ]:


# Function to predict values of y for a given x
def predict_y(x_val):
    """Predict y at x_val as the mean y of nearby points in `example`.

    "Nearby" means rows of the module-level `example` table whose 'x'
    value falls in the window from x_val - 0.25 to x_val + 0.25. The
    table is looked up at call time, so the function follows whatever
    `example` is currently bound to.
    """
    nearby_points = example.where('x', are.between(x_val - 0.25, x_val + 0.25))
    return np.mean(nearby_points.column('y'))


# In[ ]:


example = example.with_column('Predicted y', example.apply(predict_y, 'x'))


# In[ ]:


# Visualize predicted values
example.scatter('x')
resize_window()


# In[ ]:


# Blue line matches our predicted points
example.scatter('x')
draw_line(slope=1, intercept=0, color='dodgerblue')
resize_window()


# In[ ]:


# Example with correlation of 0
example = r_table(0)
example.scatter('x', 'y')
resize_window()


# In[ ]:


# Predictions produce approximately horizontal line
example = example.with_column('Predicted y', example.apply(predict_y, 'x'))
example.scatter('x')
resize_window()


# In[ ]:


# Example with correlation of 0.5
# Which line follows the pattern?
example = r_table(0.5)
example.scatter('x', 'y')
resize_window()
draw_vertical_line(1.5)
draw_line(slope=1, intercept=0)
draw_line(slope=0.2, intercept=0, color='dodgerblue')


# In[ ]:


# Nearest neighbors predictions
example = example.with_column('Predicted y', example.apply(predict_y, 'x'))
example.scatter('x')
resize_window()


# In[ ]:


# Blue line matches our predictions well
example.scatter('x')
draw_line(slope=1, intercept=0, color='red')
draw_line(slope=0.5, intercept=0, color='dodgerblue')
resize_window()


# In[ ]:


################################################

def standard_units(arr):
    """Convert an array to standard units.

    Subtracts the mean and divides by the standard deviation as computed
    by np.std with its default arguments.
    """
    return (arr - np.average(arr)) / np.std(arr)


def correlation(t, x, y):
    """Return the correlation coefficient of columns x and y of table t.

    Computed as the average of the products of the two columns after
    each has been converted to standard units.
    """
    x_standard = standard_units(t.column(x))
    y_standard = standard_units(t.column(y))
    return np.average(x_standard * y_standard)


def slope(t, x, y):
    """Return the slope of the regression line predicting y from x.

    The slope is r * SD(y) / SD(x), where r is the correlation of the
    two columns of table t.
    """
    r = correlation(t, x, y)
    y_sd = np.std(t.column(y))
    x_sd = np.std(t.column(x))
    return r * y_sd / x_sd


def intercept(t, x, y):
    """Return the intercept of the regression line predicting y from x.

    The intercept is mean(y) - slope * mean(x), so the line passes
    through the point of averages.
    """
    x_mean = np.mean(t.column(x))
    y_mean = np.mean(t.column(y))
    return y_mean - slope(t, x, y) * x_mean


# In[ ]:


galton = Table.read_table('galton.csv')

heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight'))
heights


# In[ ]:


def predict_child(h):
    """Return a prediction of the height of a child
    whose parents have a midparent height of h.

    The prediction is the average height of the children
    whose midparent height is in the range h plus or minus 0.5 inches.
    """
    close_points = heights.where('MidParent', are.between(h - 0.5, h + 0.5))
    return close_points.column('Child').mean()


# In[ ]:


heights_with_predictions = heights.with_column(
    'Average neighbor prediction', heights.apply(predict_child, 'MidParent'))


# In[ ]:


galton_slope = slope(heights, 'MidParent', 'Child')
galton_intercept = intercept(heights, 'MidParent', 'Child')
galton_slope, galton_intercept


# In[ ]:


heights.take(123)


# In[ ]:


# Regression prediction for the single row above; 'MidParent' is the
# first column, which the original positional index [0] selected.
galton_slope * heights.take(123).column('MidParent') + galton_intercept


# In[ ]:


heights_with_predictions.where('MidParent', are.equal_to(69.48))


# In[ ]:


heights_with_predictions = heights_with_predictions.with_column(
    'Regression Prediction',
    galton_slope * heights.column('MidParent') + galton_intercept
)
heights_with_predictions


# In[ ]:


heights_with_predictions.scatter('MidParent')


# In[ ]:


demographics = Table.read_table('district_demographics2016.csv')
demographics


# In[ ]:


demographics.scatter('College%', 'Median Income')


# In[ ]:


demographics_slope = slope(demographics, 'College%', 'Median Income')
demographics_intercept = intercept(demographics, 'College%', 'Median Income')
(demographics_slope, demographics_intercept)


# In[ ]:


demographics.scatter('College%', 'Median Income')
draw_line(slope=demographics_slope,
          intercept=demographics_intercept,
          x=make_array(0, 80))


# In[ ]: