from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
# Use the 'fivethirtyeight' matplotlib style for the plots.
plots.style.use('fivethirtyeight')
# Read galton.csv and keep columns 3 and 7, relabeled as 'MidParent'
# and 'Child'.
galton = Table.read_table('galton.csv')
heights = galton.select(3, 7).relabeled(0, 'MidParent').relabeled(1, 'Child')
heights
# Scatter plot of 'Child' against 'MidParent'.
heights.scatter('MidParent', 'Child')
# The same scatter plot, with red vertical lines drawn at MidParent = 67.5
# and MidParent = 68.5 (each running from y = 50 to y = 85).
heights.scatter('MidParent', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);
# Rows whose 'MidParent' value falls between 67.5 and 68.5.
nearby = heights.where('MidParent', are.between(67.5, 68.5))
# Mean 'Child' value inside that band. It is kept in a variable so the plot
# below marks the computed value instead of a hand-copied, rounded number.
nearby_mean = nearby.column('Child').mean()
nearby_mean
# Scatter plot with the band edges in red and the band's mean child height
# marked in gold at MidParent = 68.
heights.scatter('MidParent', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
plots.scatter(68, nearby_mean, color='gold', s=50);
def predict_child(h):
    """Return the mean 'Child' value of the rows of ``heights`` near ``h``.

    "Near" means the row's 'MidParent' value passes
    ``are.between(h - 0.5, h + 0.5)``.
    """
    lower, upper = h - 0.5, h + 0.5
    close_rows = heights.where('MidParent', are.between(lower, upper))
    child_values = close_rows.column('Child')
    return child_values.mean()
# Try the predictor at a few midparent heights.
predict_child(68)
predict_child(70)
predict_child(72)
# Apply predict_child to every value in the 'MidParent' column.
predicted = heights.apply(predict_child, 'MidParent')
predicted
# Store the predictions next to the data as a 'Predicted child' column.
heights = heights.with_column('Predicted child', predicted)
heights
# Scatter plot with 'MidParent' on the horizontal axis.
heights.scatter('MidParent')
# Read temperatures.csv.
temperatures = Table.read_table('temperatures.csv')
temperatures
# Line plot with 'Day' on the horizontal axis.
temperatures.plot('Day')
# Histograms of 'Low' and 'High', with bin edges every 5 from 30 to 100.
temperatures.select('Low', 'High').hist(bins=np.arange(30, 105, 5))
# Scatter plot of 'High' against 'Low'.
temperatures.scatter('Low', 'High')
def difference(x, y):
    """Return ``x`` minus ``y`` (used below as high temp minus low temp)."""
    gap = x - y
    return gap
# Quick check of difference on a single pair of values.
difference(65, 54)
# Row by row, compute 'High' minus 'Low' and store it as a 'Spread' column.
daily_spread = temperatures.apply(difference, 'High', 'Low')
temperatures = temperatures.with_column('Spread', daily_spread)
temperatures
# Histogram of 'Spread', with bin edges every 4 from 0 to 36.
temperatures.hist('Spread', bins=np.arange(0, 40, 4))
# Proportion of rows whose 'Spread' is above 20.
temperatures.where('Spread', are.above(20)).num_rows / temperatures.num_rows
# Function with Optional Arguments
def percents(s, places):
    """Convert values to percentages of their total.

    Args:
        s: Array-like of numbers (NumPy array, list, or tuple).
        places: Number of decimal places to round each percentage to.

    Returns:
        NumPy array holding ``s / total * 100`` rounded to ``places``
        decimal places.
    """
    # Coerce to an array so plain lists and tuples work too; a bare list
    # cannot be divided by a number.
    values = np.asarray(s)
    # Summing along axis 0 matches what the built-in sum() does for arrays.
    total = values.sum(axis=0)
    return np.round(values / total * 100, places)
# Sample values, converted to percentages rounded to 4 decimal places.
x = make_array(2, 5, 16)
percents(x, 4)
def percents(s, places=2):
    """Convert values to percentages of their total.

    Args:
        s: Array-like of numbers (NumPy array, list, or tuple).
        places: Number of decimal places to round each percentage to.
            Optional; defaults to 2.

    Returns:
        NumPy array holding ``s / total * 100`` rounded to ``places``
        decimal places.
    """
    # Coerce to an array so plain lists and tuples work too; a bare list
    # cannot be divided by a number.
    values = np.asarray(s)
    # Summing along axis 0 matches what the built-in sum() does for arrays.
    total = values.sum(axis=0)
    return np.round(values / total * 100, places)
# With places omitted, the default of 2 decimal places applies.
percents(x)
# Read cones.csv.
all_cones = Table.read_table('cones.csv')
all_cones
# Drop the 'Color' column and exclude the row at index 5.
cones = all_cones.drop('Color').exclude(5)
cones
# Group by 'Flavor': first with no collector, then collecting each group's
# values with min, list, and np.average.
cones.group('Flavor')
cones.group('Flavor', min)
cones.group('Flavor', list)
cones.group('Flavor', np.average)
# The minimum price per flavor can be obtained with .group ...
cones.group('Flavor', min)
# ... or, for a single flavor, computed with other table methods:
min(cones.where('Flavor', 'chocolate').column('Price'))
# User-defined function
def spread(arr):
    """Return the range of ``arr``: its largest value minus its smallest."""
    highest = max(arr)
    lowest = min(arr)
    return highest - lowest
# Quick check of spread on three values.
spread(make_array(7, 10, 2))
# A user-defined function can be passed to .group as the collector.
cones.group('Flavor', spread)
cones
all_cones
# Group by the combination of 'Flavor' and 'Color'; the second call
# collects each group's values with np.average.
all_cones.group(['Flavor', 'Color'])
all_cones.group(['Flavor', 'Color'], np.average)
# Examples
# Read nba_salaries.csv and relabel column 3 as 'SALARY'.
nba = Table.read_table('nba_salaries.csv').relabeled(3, 'SALARY')
nba
# total salary paid by each team, highest first
nba.select('TEAM', 'SALARY').group('TEAM', sum).sort('SALARY sum', descending=True)
# The same grouping without selecting columns first: sum is applied to
# every remaining column.
nba.group('TEAM', sum)
# average salary paid for each position
nba.select('POSITION', 'SALARY').group('POSITION', np.average)
# for each team, average salary paid for each position
nba.drop('PLAYER').group(['TEAM', 'POSITION'], np.average)
# Compare grouping on two columns with pivoting on the same two columns.
all_cones
all_cones.group(['Flavor', 'Color'])
all_cones.pivot('Flavor', 'Color')
# Pivot again, filling the cells from 'Price' collected with np.average
# and then with list.
all_cones.pivot('Flavor', 'Color', values='Price', collect=np.average)
all_cones.pivot('Flavor', 'Color', values='Price', collect=list)
# Examples
# Read survey10.csv and display 3 rows.
survey = Table.read_table('survey10.csv')
survey.show(3)
# Pivot 'love_at_first_sight' against 'year'.
survey.pivot('love_at_first_sight', 'year')
# Pivot 'love_at_first_sight' against 'super_power', filling the cells
# from 'number' collected with np.mean.
survey.pivot('love_at_first_sight', 'super_power',values="number",collect=np.mean)
# Which NBA teams spent the most on their "starters" in 2015-2016?
nba
# Let's first look at average salary per team per position
nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)
# If we assume the "starter" is the player paid the most in each position,
# we can use "collect = max"
nba_starters_salaries = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)
# Now we need to find the total paid for the starters: drop the 'TEAM'
# label column, then apply sum with no column labels given.
# NOTE(review): this relies on apply passing each whole row to sum when no
# columns are named -- confirm against the datascience Table.apply docs.
totals = nba_starters_salaries.drop("TEAM").apply(sum)
# Add "totals" to our "nba_starters_salaries" table and sort by total,
# highest first
nba_starters_salaries.with_column("TOTAL", totals).sort("TOTAL",descending = True)