from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore')
# --- Billionaires -----------------------------------------------------------
# Load the billionaires table and preview it.
billionaires = Table.read_table('billionaires.csv')
billionaires

# One horizontal bar chart per categorical column: count the rows in each
# category, order the categories from most to fewest rows, and plot.
for category in ('citizenship', 'industry'):
    billionaires.group(category).sort(1, descending=True).barh(category)

# Cross-tabulate citizenship (rows) against industry (columns).
billionaires.pivot('industry', 'citizenship').show()

# Load the country lookup table and look at one continent's rows.
countries = Table.read_table('countries.csv')
countries
countries.where('Continent', are.equal_to('North America'))

# Attach each billionaire's country information by matching the
# 'citizenship' column against the 'Country' column of the lookup table.
billionaires = billionaires.join('citizenship', countries, 'Country')
billionaires

# Same cross-tabulation as before, now with continents as the rows.
billionaires.pivot('industry', 'Continent')
# --- CEO donations ----------------------------------------------------------
# Load the CEO donation table and preview it.
ceos = Table.read_table('ceo.csv')
ceos

# Distribution of donation amounts with default bins.
ceos.hist('amount')

# Largest donations first, then smallest first (ascending is the default).
ceos.sort('amount', descending=True)
ceos.sort('amount')

# Zoom in on amounts below 100,000 using bins that are 1,000 wide.
ceos.hist(
    'amount',
    bins=np.arange(0, 1e5, 1e3),
    unit='$1000',
)

# Keep only the billionaires whose name also appears in the 'ceo' column.
b_donations = billionaires.join('name', ceos, 'ceo')
b_donations.show()

# Net worth against donation amount, leaving out donations of 6e6 or more.
(
    b_donations
    .where('amount', are.below(6e6))
    .scatter('net worth', 'amount')
)

# Donation divided by net worth, stored as a new column; then rank the
# rows by donation amount, largest first.
donation_prop = b_donations.column('amount') / b_donations.column('net worth')
b_donations = (
    b_donations
    .with_column('Contribution%', donation_prop)
    .sort('amount', descending=True)
)
b_donations.set_format('Contribution%', PercentFormatter)
b_donations.show(20)

# Scratch arithmetic: 0.02 percent of 20000.
0.02 / 100 * 20000
# --- Meteorites -------------------------------------------------------------
# Load the meteorite table and preview it.
meteorites = Table.read_table('meteorites_na.csv')
meteorites

# Row counts per class, most common class first.
meteorites.group('class').sort(1, descending=True)

# Row counts per 'how' value, then the per-group average of every column.
meteorites.group('how')
meteorites.group('how', np.average)

# Overlaid histograms of year in 5-year bins, one per 'how' value.
meteorites.hist(
    'year',
    bins=np.arange(1800, 2020, 5),
    group='how',
)

# Year against mass: every row first, then only rows with mass below 1e5.
meteorites.scatter('year', 'mass')
meteorites.where('mass', are.below(1e5)).scatter('year', 'mass')

# Rows after 1950, split by the value in the 'how' column.
after_1950 = meteorites.where('year', are.above(1950))
new_fell = after_1950.where('how', are.equal_to('Fell'))
new_found = after_1950.where('how', are.equal_to('Found'))
new_found

# Map both subsets: markers for 'Fell', circles for 'Found'.
Marker.map_table(new_fell.select('lat', 'lon', 'name'))
Circle.map_table(new_found.select('lat', 'lon', 'name'), radius=10)
If you loaded the meteorites dataset above, now might be a good time to restart your kernel, run the cell at the top of the notebook, then come back here.
The bike datasets in this section are big, and could lead to kernel crashes.
# --- Bike trips -------------------------------------------------------------
# Load the trip table and preview it.
trip = Table.read_table('trip.csv')
trip

# Keep only trips with a duration below 1800, then plot the durations:
# first with default bins, then with 60 bins and the unit labelled 'second'.
commute = trip.where('Duration', are.below(1800))
commute.hist('Duration')
commute.hist('Duration', bins=60, unit='second')

# Percent of people who have a ride duration between 250 and 500 seconds:
# interval width times 0.15 (presumably the bar height read off the
# histogram above -- confirm against the plot).
(500 - 250) * 0.15

# Number of commute trips per start station, busiest station first.
starts = (
    commute
    .group('Start Station')
    .sort('count', descending=True)
)
starts

# Trip counts for every (end station, start station) combination.
commute.pivot('Start Station', 'End Station')

# Narrow the full trip table to the three columns used below.
duration = trip.select('Start Station', 'End Station', 'Duration')
duration

# Shortest recorded duration for each start/end station pair.
shortest = (
    duration
    .group(['Start Station', 'End Station'], min)
    .relabeled('Duration min', 'Minimum Duration')
)
shortest

# Pairs whose start station name contains 'Civic Center BART',
# shortest minimum duration first.
from_cc = (
    shortest
    .where('Start Station', are.containing('Civic Center BART'))
    .sort('Minimum Duration')
)
from_cc

# --- Stations ---------------------------------------------------------------
# Load the station table, preview it, and put every station on a map.
stations = Table.read_table('station.csv')
stations
Marker.map_table(stations.select('lat', 'long', 'name'))

# Stations whose landmark is San Francisco, drawn as green circles.
sf = stations.where('landmark', are.equal_to('San Francisco'))
Circle.map_table(
    sf.select('lat', 'long', 'name'),
    color='green',
    radius=15,
)
# The name of the station where the most rentals ended (assume no ties).
# The number of stations for which the average duration ending at that station was more than 10 minutes.
# The number of stations that have more than 500 starts AND more than 500 ends
# NOTE: the trip table labels its station columns 'Start Station' and
# 'End Station' (the labels used by every other group/pivot/select call on
# this table), so those full labels are used here rather than 'Start'/'End'.

# The name of the station where the most rentals ended (assume no ties).
# First, find end counts
# Then, find the station with the highest end count
(
    trip
    .group('End Station')
    .sort('count', descending=True)
    .column('End Station')
    .item(0)
)

# The number of stations for which the average duration ending at that station was more than 10 minutes.
# First, find the average end time for each station
# Then, keep the ones above 10 minutes
# Then, count them
# Only the two relevant columns are selected before grouping so the averaged
# column can be addressed by name ('Duration average') instead of by a
# positional index that depends on the column order of the CSV.
# Durations are in seconds, so 10 minutes is 10 * 60.
(
    trip
    .select('End Station', 'Duration')
    .group('End Station', np.average)
    .where('Duration average', are.above(10 * 60))
    .num_rows
)

# The number of stations that have more than 500 starts AND more than 500 ends.
# First, group the trip on starts and ends, separately
# Then, join the two grouped tables
# Then, count the number of rows where the stations' start and end counts are above 500
start_counts = trip.group('Start Station').relabeled('count', 'Start Count')
end_counts = trip.group('End Station').relabeled('count', 'End Count')
(
    start_counts
    .join('Start Station', end_counts, 'End Station')
    .where('Start Count', are.above(500))
    .where('End Count', are.above(500))
    .num_rows
)