from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
all_cones = Table.read_table('cones.csv')
all_cones
all_cones.group(['Flavor', 'Color'])
all_cones.pivot('Flavor', 'Color')
all_cones.pivot('Color', 'Flavor')
all_cones.pivot('Color', 'Flavor', values = 'Price', collect = list)
nba = Table.read_table('nba_salaries.csv').relabeled("'15-'16 SALARY", 'SALARY')
nba
# for each team, average salary paid for each position
nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)
# for each team,
# amount paid to "starter" (player earning the most) in each position
starters = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)
starters
# sort teams by total amount paid to starters
totals = starters.drop('TEAM').apply(sum)
starters.with_column('TOTAL', totals).sort('TOTAL', descending=True)
#Make table 1
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
['Milk Tea', 'Tea One', 4],
['Espresso', 'Nefeli', 2],
['Latte', 'Nefeli', 3],
['Espresso', "Abe's", 2]
])
drinks
#Make table 2
discounts = Table().with_columns(
'Coupon % off', make_array(25, 50, 5),
'Location', make_array('Tea One', 'Nefeli', 'Tea One')
)
discounts
combined = drinks.join('Cafe', discounts, 'Location')
combined
discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/ 100)
combined.with_column('Discounted', discounted_prices)
drinks
#Join table with itself
two = drinks.join('Cafe', drinks)
two
two.with_column('Total', two.column('Price') + two.column('Price_2'))
There was recently discussion about whether women tennis players get penalized more than men, for coaching violations. Let's look at some data.
# Data on number of code violations at tennis Grand Slam tournaments, 1998-2018
men_violations = Table.read_table('tennis_men_penalties.csv')
women_violations = Table.read_table('tennis_women_penalties.csv')
men_violations
women_violations
combined = men_violations.join('Violation', women_violations)
combined
combined.barh('Violation')
men_sets = 460
women_sets = 283
normalized = combined.with_columns(
'Rate (Men)', combined.column('Count (Men)') / men_sets,
'Rate (Women)', combined.column('Count (Women)') / women_sets
)
normalized.drop('Count (Men)', 'Count (Women)').barh('Violation')
###-----Bike rentals in Bay Area from 9/2014-8/2015
#Trip ID = ID for the rental
#Duration = duration of rental (seconds)
#Start date = start date of rental
#Start Station = Name of the Start Station
#Start Terminal = code for Start Terminal
#End Date = end date of rental
#End Station = Name of the End Station
#End Terminal = code for End Terminal
#Bike # = A serial number for the bike
#Subscriber Type = "Customer" or "Subscriber"
#Zip code
trips = Table.read_table('trip.csv')
trips
# Trips < 1800 seconds (30 min) are free!
commute = trips.where('Duration', are.below(1800))
commute.hist('Duration')
commute.hist('Duration', bins=60, unit='second')
#Most trips between the 250 and 550 range
#Rough estimate of percent between 250 and 550 minutes
(550-250) * 0.15
#Smaller/more bins, but general shape of the distributions stays the same
commute.hist('Duration', bins=np.arange(1801), unit='second')
#What is the most frequently used start station?
starts = commute.group('Start Station').sort('count', descending=True)
starts
San Francisco Caltrain Station on Townsend and 4th - place where people who take the train into the city can grab a bike to get to work, etc.
#Pivot can be used to investigate between which stations people are traveling
commute.pivot('Start Station', 'End Station')
#Look at durations between start and end stations
duration = trips.select('Start Station', 'End Station', 'Duration')
duration
#Use the group method and find the minimum duration
#One trip started and ended at 2nd at Folsom
shortest = duration.group(['Start Station', 'End Station'], min)
shortest
#Civic Center Bart (7th at Market) is a popular location
#We can investigate where people are headed from there, along with the shortest trip duration
from_cc = shortest.where('Start Station', are.containing('Civic Center BART')).sort('Duration min')
from_cc
#Locations of each bike station
stations = Table.read_table('station.csv').relabel("name","labels")
stations
#Draw map where stations are located
Marker.map_table(stations.select('lat', 'long', 'labels'))
sf = stations.where('landmark', 'San Francisco')
Circle.map_table(sf.select('lat', 'long', 'labels'), color='green', radius=50)
#Color the markers according to the landmark
colors = stations.group('landmark').with_column(
'colors', make_array('blue', 'red', 'green', 'orange', 'purple'))
colors
stations.join('landmark', colors)
colored = stations.join('landmark', colors).select('lat', 'long', 'labels', 'colors')
Marker.map_table(colored)
#Recall the counts per Start Station
starts
station_starts = stations.join('labels', starts, 'Start Station')
station_starts
Circle.map_table(station_starts.select('lat', 'long', 'labels').with_columns(
'colors', 'blue',
'areas', station_starts.column('count')
))