In [ ]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 9

Histogram Review

In [ ]:
# Survey columns and the question behind each one:
#sleep = On average, how many hours of sleep do you get each night?
#tv = During a typical week, how many hours do you spend watching television?
#number = Pick a number between 0 and 9.
#studying = During a typical week, how many hours do you spend studying?
#  NOTE(review): the cells below read this column as 'studying' -- confirm the label in survey09.csv
#snow_white = Which of the Seven Dwarfs from Snow White are you most like?
survey = Table.read_table('survey09.csv')
# Preview only a few rows instead of rendering the whole table.
survey.show(5)
In [ ]:
# The question asks for a digit from 0 through 9, so the bin edges must run
# 0, 1, ..., 10: ten unit-wide bins, one per possible answer. If the edges
# stop at 8, answers of 9 fall outside every bin and 8s are lumped into the
# final bin together with 7s.
survey.hist('number', bins=np.arange(0, 11))
In [ ]:
# Pull out the values of the 'studying' column on their own.
study = survey.column("studying")
In [ ]:
# Smallest and largest reported values; useful for choosing the bin range below.
[min(study), max(study)]
In [ ]:
# Bin edges 0, 10, ..., 80 (width 10).
survey.hist('studying', bins=np.arange(0, 81, 10))
In [ ]:
# Bin edges 0, 5, ..., 50 (width 5): a narrower range with finer bins.
survey.hist('studying', bins=np.arange(0, 51, 5))
In [ ]:
# Bin edges 0, 2, ..., 50 (width 2): same range, finer still.
survey.hist('studying', bins=np.arange(0, 51, 2))
In [ ]:
# Here `bins` is a single integer rather than an array of edges.
# NOTE(review): presumably this requests that many equal-width bins -- confirm in the Table.hist docs.
survey.hist('tv', bins=15)
In [ ]:
survey.hist('sleep', bins=5)
In [ ]:
# 'snow_white' is categorical, so group by answer and draw a horizontal bar chart
# instead of a histogram.
seven = survey.group('snow_white')
seven.barh('snow_white')
In [ ]:
 
In [ ]:
galton = Table.read_table('galton.csv')
In [ ]:
#Each row corresponds to one adult child
#family = family indicator
#father height (inches)
#mother height (inches)
#"midparent height"= weighted average of parents' heights
#children= # of children in the family
#childNum = child's birth rank (1 = oldest)
#gender
#height (inches)
galton
In [ ]:
# Keep two columns by position and give them short labels.
# NOTE(review): going by the column list in the previous cell, index 3 is the
# midparent height and index 7 is the child's height -- confirm against galton.csv.
heights = galton.select(3, 7).relabeled(0, 'MidParent').relabeled(1, 'Child')
In [ ]:
heights
In [ ]:
# Shared bin edges 55, 57, ..., 79 reused by every height histogram below.
my_bins = np.arange(55, 80, 2)
In [ ]:
#Histogram of child heights
# The unit is spelled 'inch' (lowercase) so it is written the same way as in
# the MidParent and combined histograms later in this notebook.
heights.hist('Child', bins=my_bins, unit='inch')
In [ ]:
#Proportion of rows whose 'Child' height satisfies are.between(65, 67)
# The result is a fraction between 0 and 1; multiply by 100 to express it as a percentage.
heights.where('Child', are.between(65, 67)).num_rows / heights.num_rows
In [ ]:
#Histogram of parent heights
heights.hist('MidParent', bins=my_bins, unit='inch')
In [ ]:
#Combined histogram
# No column label is passed here, only the shared bins and unit.
heights.hist(bins=my_bins, unit='inch')

Functions

In [ ]:
def double(x):
    """Return x multiplied by 2, using whatever `*` means for x's type."""
    doubled = x * 2
    return doubled
In [ ]:
double(7)
In [ ]:
# The argument expression is evaluated before the call; 15/3 is the float 5.0.
double(15/3)
In [ ]:
my_number = 12
In [ ]:
# A name can be passed as the argument; double does not reassign my_number.
double(my_number)
In [ ]:
double(my_number / 8)
In [ ]:
# double only does `x * 2`, so it works on anything that supports `*`, arrays included.
double(make_array(3, 4, 5))
In [ ]:
# For a string, `* 2` repeats the text.
double('data')
In [ ]:
#"local scope"
# The parameter x of double exists only while double is running, and no global
# x has been assigned yet, so looking x up here raises NameError. The error is
# caught and printed so the demonstration still shows the message without
# halting a Restart & Run All of the notebook.
try:
    x
except NameError as scope_error:
    print(scope_error)
In [ ]:
# Now create a global variable that happens to share its name with double's parameter.
x = 17
In [ ]:
# Inside this call the parameter x is 2; the global x is a separate variable.
double(2)
In [ ]:
# Still the value assigned above: double never assigns to the global x.
x
In [ ]:
# Passing the global x as the argument.
double(x)
In [ ]:
# Unchanged again: double returns a new value rather than modifying its argument's name.
x

Discussion Question

In [ ]:
#What does this function do?
def percents(values):
    """Express each entry of `values` as a percentage of their total, rounded to 2 decimal places."""
    total = sum(values)
    share_of_total = values / total
    return np.round(share_of_total * 100, 2)
In [ ]:
# Try the function on a small array whose total is easy to compute by hand.
percents(make_array(1, 2, 3, 4))
In [ ]:
# A total that does not divide evenly, so the rounding becomes visible.
percents(make_array(1, 4, 30))
In [ ]:
#Can have multiple inputs
def percents(values, places=2):
    """Express each entry of `values` as a percentage of their total.

    values: array of numbers to convert.
    places: number of decimal places to round to. Defaults to 2, the rounding
        used by the one-argument version defined earlier in the notebook, so
        calls that pass only `values` keep working after this redefinition.

    Returns an array of the rounded percentages.
    """
    return np.round(values / sum(values) * 100, places)
In [ ]:
# Same values as the earlier call, now rounded to 1 decimal place via the second argument.
percents(make_array(1, 4, 30), 1)

Apply

In [ ]:
# Small hand-built table (four people, two columns) for demonstrating apply.
# Two of the ages are above 100 so that the cutoff function below has an effect.
ages = Table().with_columns(
    'Person', make_array('A', 'B', 'C', 'D'),
    'Age', make_array(63, 110, 99, 102)
)
ages
In [ ]:
def cut_off_at_100(z):
    """Return z unchanged unless it is greater than 100, in which case return 100."""
    if 100 < z:
        return 100
    return z
In [ ]:
# Below the cutoff: the value comes back unchanged.
cut_off_at_100(3)
In [ ]:
# Above the cutoff: the result is 100.
cut_off_at_100(107)
In [ ]:
# Pass the function itself (no parentheses) together with the column label 'Age'.
# NOTE(review): apply presumably calls the function once per value in that column
# and collects the results -- confirm in the Table.apply docs.
cut_age_array = ages.apply(cut_off_at_100, 'Age')
cut_age_array
In [ ]:
# The result is displayed but not assigned to a name.
ages.with_column('Cut off ages', cut_age_array)
In [ ]:
# A function is itself a value with a type, which is why it could be passed to apply above.
type(cut_off_at_100)

Prediction

In [ ]:
heights
In [ ]:
heights.scatter('MidParent', 'Child')
In [ ]:
heights.scatter('MidParent', 'Child')
# Two vertical red lines at x = 67.5 and x = 68.5, each drawn from y = 50 to y = 85,
# marking a strip around a midparent height of 68.
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
# The trailing semicolon suppresses the text repr of the value returned by plot.
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);
In [ ]:
# Rows whose 'MidParent' value satisfies are.between(67.5, 68.5): the strip marked above.
nearby = heights.where('MidParent', are.between(67.5, 68.5))
# Mean of the 'Child' values within that strip.
nearby.column('Child').mean()
In [ ]:
heights.scatter('MidParent', 'Child')
# Vertical red lines at x = 67.5 and x = 68.5 mark the strip around a midparent height of 68.
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
# Gold point at the centre of the strip (x = 68), at the mean child height of
# the rows in `nearby`. The mean is computed here rather than typed in as a
# rounded constant, so the point stays correct if the data changes.
plots.scatter(68, nearby.column('Child').mean(), color='gold', s=75);
In [ ]:
def predict_child(h):
    """Predict a child's height for midparent height h.

    Selects the rows of the global `heights` table whose 'MidParent' value
    satisfies are.between(h - 0.5, h + 0.5) and returns the mean of their
    'Child' values.
    """
    lower, upper = h-0.5, h+0.5
    in_strip = are.between(lower, upper)
    child_heights = heights.where('MidParent', in_strip).column('Child')
    return child_heights.mean()
In [ ]:
# Same strip as the hand-computed example above (67.5 to 68.5).
predict_child(68)
In [ ]:
predict_child(65)
In [ ]:
# Run predict_child using the 'MidParent' column as its input.
predictions = heights.apply(predict_child, 'MidParent')
In [ ]:
# NOTE: this rebinds the name `heights`. predict_child reads the global `heights`
# but only uses its 'MidParent' and 'Child' columns, which are still present.
heights = heights.with_column('Child Prediction', predictions)
In [ ]:
heights
In [ ]:
# Only the x-axis column is named here.
# NOTE(review): presumably every other column ('Child' and 'Child Prediction') is
# plotted against it on the same axes -- confirm in the Table.scatter docs.
heights.scatter('MidParent')
In [ ]: