In [ ]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Pivot Tables

In [ ]:
all_cones = Table.read_table('cones.csv')
In [ ]:
all_cones
In [ ]:
all_cones.group(['Flavor', 'Color'])
In [ ]:
all_cones.pivot('Flavor', 'Color')
In [ ]:
all_cones.pivot('Color', 'Flavor')
In [ ]:
all_cones.pivot('Color', 'Flavor', values = 'Price', collect = list)

Pivot Examples

In [ ]:
nba = Table.read_table('nba_salaries.csv').relabeled("'15-'16 SALARY", 'SALARY')
nba
In [ ]:
# for each team, average salary paid for each position

nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)
In [ ]:
# for each team,
# amount paid to "starter" (player earning the most) in each position

starters = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)
starters
In [ ]:
# sort teams by total amount paid to starters

totals = starters.drop('TEAM').apply(sum)
In [ ]:
starters.with_column('TOTAL', totals).sort('TOTAL', descending=True)
In [ ]:
 

Join Examples

In [ ]:
#Make table 1
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
    ['Milk Tea', 'Tea One', 4],
    ['Espresso', 'Nefeli',  2],
    ['Latte',    'Nefeli',  3],
    ['Espresso', "Abe's",   2]
])
drinks
In [ ]:
#Make table 2
discounts = Table().with_columns(
    'Coupon % off', make_array(25, 50, 5),
    'Location', make_array('Tea One', 'Nefeli', 'Tea One')
)
discounts
In [ ]:
combined = drinks.join('Cafe', discounts, 'Location')
combined
In [ ]:
discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/ 100)
combined.with_column('Discounted', discounted_prices)
In [ ]:
drinks
In [ ]:
#Join table with itself
two = drinks.join('Cafe', drinks)
two
In [ ]:
two.with_column('Total', two.column('Price') + two.column('Price_2'))

Join Examples: Tennis

There was recently discussion about whether women tennis players get penalized more than men, for coaching violations. Let's look at some data.

In [ ]:
# Data on number of code violations at tennis Grand Slam tournaments, 1998-2018
men_violations = Table.read_table('tennis_men_penalties.csv')
women_violations = Table.read_table('tennis_women_penalties.csv')
In [ ]:
men_violations
In [ ]:
women_violations
In [ ]:
combined = men_violations.join('Violation', women_violations)
combined
In [ ]:
combined.barh('Violation')
In [ ]:
men_sets = 460
women_sets = 283
normalized = combined.with_columns(
    'Rate (Men)', combined.column('Count (Men)') / men_sets,
    'Rate (Women)', combined.column('Count (Women)') / women_sets
)
normalized.drop('Count (Men)', 'Count (Women)').barh('Violation')

Bikes

In [ ]:
###-----Bike rentals in Bay Area from 9/2014-8/2015
#Trip ID = ID for the rental
#Duration = duration of rental (seconds)
#Start date = start date of rental
#Start Station = Name of the Start Station 
#Start Terminal = code for Start Terminal
#End Date = end date of rental
#End Station = Name of the End Station 
#End Terminal = code for End Terminal
#Bike # = A serial number for the bike
#Subscriber Type = "Customer" or "Subscriber"
#Zip code 

trips = Table.read_table('trip.csv')
trips
In [ ]:
# Trips < 1800 seconds (30 min) are free!
commute = trips.where('Duration', are.below(1800))
commute.hist('Duration')
In [ ]:
commute.hist('Duration', bins=60, unit='second')
In [ ]:
#Most trips between the 250 and 550 range
#Rough estimate of percent between 250 and 550  minutes
(550-250) * 0.15
In [ ]:
#Smaller/more bins, but general shape of the distributions stays the same
commute.hist('Duration', bins=np.arange(1801), unit='second')
In [ ]:
#What is the most frequently used start station?
starts = commute.group('Start Station').sort('count', descending=True)
starts

San Francisco Caltrain Station on Townsend and 4th - place where people who take the train into the city can grab a bike to get to work, etc.

In [ ]:
#Pivot can be used to investigate between which stations people are traveling
commute.pivot('Start Station', 'End Station')

Shortest trips

In [ ]:
#Look at durations between start and end stations
duration = trips.select('Start Station', 'End Station', 'Duration')
duration
In [ ]:
#Use the group method and find the minimum duration
#One trip started and ended at 2nd at Folsom
shortest = duration.group(['Start Station', 'End Station'], min)
shortest
In [ ]:
#Civic Center Bart (7th at Market) is a popular location
#We can investigate where people are headed from there, along with the shortest trip duration
from_cc = shortest.where('Start Station', are.containing('Civic Center BART')).sort('Duration min')
from_cc

Maps

In [ ]:
#Locations of each bike station
stations = Table.read_table('station.csv').relabel("name","labels")
stations
In [ ]:
#Draw map where stations are located
Marker.map_table(stations.select('lat', 'long', 'labels'))
In [ ]:
sf = stations.where('landmark', 'San Francisco')
Circle.map_table(sf.select('lat', 'long', 'labels'), color='green', radius=50)
In [ ]:
#Color the markers according to the landmark
colors = stations.group('landmark').with_column(
    'colors', make_array('blue', 'red', 'green', 'orange', 'purple'))
colors
In [ ]:
stations.join('landmark', colors)
In [ ]:
colored = stations.join('landmark', colors).select('lat', 'long', 'labels', 'colors')
Marker.map_table(colored)
In [ ]:
#Recall the counts per Start Station
starts
In [ ]:
station_starts = stations.join('labels', starts, 'Start Station')
station_starts
In [ ]:
Circle.map_table(station_starts.select('lat', 'long', 'labels').with_columns(
    'colors', 'blue',
    'areas', station_starts.column('count')
))
In [ ]:
 
In [ ]: