In [ ]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Pivot Tables

In [ ]:
all_cones = Table.read_table('cones.csv')
In [ ]:
all_cones
In [ ]:
all_cones.group(['Flavor', 'Color'])
In [ ]:
all_cones.pivot('Flavor', 'Color')
In [ ]:
all_cones.pivot('Color', 'Flavor')
In [ ]:
all_cones.pivot('Color', 'Flavor', values = 'Price', collect = list)

Pivot Examples

In [ ]:
nba = Table.read_table('nba_salaries.csv').relabeled("'15-'16 SALARY", 'SALARY')
nba
In [ ]:
# for each team, average salary paid for each position

nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)
In [ ]:
# for each team,
# amount paid to "starter" (player earning the most) in each position

starters = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)
starters
In [ ]:
# sort teams by total amount paid to starters

totals = starters.drop('TEAM').apply(sum)
In [ ]:
starters.with_column('TOTAL', totals).sort('TOTAL', descending=True)
In [ ]:
 

Join Examples

In [ ]:
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
    ['Milk Tea', 'Tea One', 4],
    ['Espresso', 'Nefeli',  2],
    ['Latte',    'Nefeli',  3],
    ['Espresso', "Abe's",   2]
])
drinks
In [ ]:
discounts = Table().with_columns(
    'Coupon % off', make_array(25, 50, 5),
    'Location', make_array('Tea One', 'Nefeli', 'Tea One')
)
discounts
In [ ]:
combined = drinks.join('Cafe', discounts, 'Location')
combined
In [ ]:
discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/ 100)
combined.with_column('Discounted', discounted_prices)
In [ ]:
drinks
In [ ]:
two = drinks.join('Cafe', drinks)
two
In [ ]:
two.with_column('Total', two.column('Price') + two.column('Price_2'))

Join Examples: Tennis

There was recently discussion about whether women tennis players get penalized more than men, for coaching violations. Let's look at some data.

In [ ]:
# Data on number of code violations at tennis Grand Slam tournaments, 1998-2018
men_violations = Table.read_table('tennis_men_penalties.csv')
women_violations = Table.read_table('tennis_women_penalties.csv')
In [ ]:
men_violations
In [ ]:
women_violations
In [ ]:
combined = men_violations.join('Violation', women_violations)
combined
In [ ]:
combined.barh('Violation')
In [ ]:
men_sets = 460
women_sets = 283
normalized = combined.with_columns(
    'Rate (Men)', combined.column('Count (Men)') / men_sets,
    'Rate (Women)', combined.column('Count (Women)') / women_sets
)
normalized.drop('Count (Men)', 'Count (Women)').barh('Violation')

While we're on the subject of tennis, You can view stats on the top men players, sorted by how often they win when they are serving -- see here. The rankings of top 10 players are available here. They can be converted these into a spreadsheet using a handy web service.

In [ ]:
top_men = Table.read_table('tennis_men_ranks.csv')
men_serving = Table.read_table('tennis_men_serving.csv')
In [ ]:
top_men
In [ ]:
men_serving
In [ ]:
men2 = top_men.join('Name', men_serving.select('Name', 'Percentage'))
men2
In [ ]:
height = Table.read_table('tennis_men_heights.csv')
height
In [ ]:
men3 = men2.join('Name', height)
men3
In [ ]:
men3.scatter('Height', 'Percentage')

Bikes

In [ ]:
trips = Table.read_table('trip.csv')
trips
In [ ]:
commute = trips.where('Duration', are.below(1800))
commute.hist('Duration')
In [ ]:
commute.hist('Duration', bins=60, unit='second')
In [ ]:
(550-250) * 0.15
In [ ]:
commute.hist('Duration', bins=np.arange(1801), unit='second')
In [ ]:
starts = commute.group('Start Station').sort('count', descending=True)
starts
In [ ]:
commute.pivot('Start Station', 'End Station')
In [ ]:
duration = trips.select('Start Station', 'End Station', 'Duration')
duration
In [ ]:
shortest = duration.group(['Start Station', 'End Station'], min)
shortest
In [ ]:
from_cc = shortest.where('Start Station', are.containing('Civic Center BART')).sort('Duration min')
from_cc

Maps

In [ ]:
stations = Table.read_table('station.csv')
stations
In [ ]:
Marker.map_table(stations.select('lat', 'long', 'name'))
In [ ]:
sf = stations.where('landmark', 'San Francisco')
Circle.map_table(sf.select('lat', 'long', 'name'), color='green', radius=150)
In [ ]:
from_cc
In [ ]:
colors = stations.group('landmark').with_column(
    'color', make_array('blue', 'red', 'green', 'orange', 'purple'))
colors
In [ ]:
colored = stations.join('landmark', colors).select('lat', 'long', 'name', 'color')
Marker.map_table(colored)
In [ ]:
station_starts = stations.join('name', starts, 'Start Station')
station_starts
In [ ]:
Circle.map_table(station_starts.select('lat', 'long', 'name').with_columns(
    'color', 'blue',
    'area', station_starts.column('count') * 1000
))
In [ ]: