In [ ]:

from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter('ignore')

Billionaires¶

In [ ]:

# Load the billionaires data from billionaires.csv and display the table.
billionaires = Table.read_table('billionaires.csv')
billionaires

In [ ]:

# Bar chart of the number of billionaires per citizenship, largest first.
# group() yields the columns 'citizenship' and 'count'; sort on the count.
billionaires.group('citizenship').sort('count', descending=True).barh('citizenship')

In [ ]:

# Bar chart of the number of billionaires per industry, largest first.
# group() yields the columns 'industry' and 'count'; sort on the count.
billionaires.group('industry').sort('count', descending=True).barh('industry')

In [ ]:

# Cross-tabulate billionaires: one column per industry, one row per
# citizenship; show() displays the full table rather than a truncated preview.
billionaires.pivot('industry', 'citizenship').show()

In [ ]:

# Load the countries data from countries.csv and display the table.
countries = Table.read_table('countries.csv')
countries

In [ ]:

# Rows of `countries` whose 'Continent' is exactly 'North America'.
countries.where('Continent', are.equal_to('North America'))

In [ ]:

# Attach country information to each billionaire by matching the
# 'citizenship' column of `billionaires` against 'Country' in `countries`.
# NOTE: this rebinds `billionaires`, so re-running the cell joins the
# already-joined table again; reload billionaires.csv first if re-running.
billionaires = billionaires.join('citizenship', countries, 'Country')
billionaires

In [ ]:

# Cross-tabulate the joined table: one column per industry, one row per
# Continent ('Continent' is available after the join with `countries`).
billionaires.pivot('industry', 'Continent')

In [ ]:

# Load the CEO data from ceo.csv and display the table.
ceos = Table.read_table('ceo.csv')
ceos

In [ ]:

# Histogram of the 'amount' column with default bins.
ceos.hist('amount')

In [ ]:

# Largest amounts first.
ceos.sort('amount', descending=True)

In [ ]:

# Smallest amounts first (ascending is the default sort order).
ceos.sort('amount')

In [ ]:

# Histogram of 'amount' using bin edges every 1e3 from 0 up to (but not
# including) 1e5, so amounts outside that range are not drawn.
# NOTE(review): unit='$1000' is only used for axis labelling -- confirm that
# this is the intended unit label for the 'amount' column.
ceos.hist('amount', bins=np.arange(0, 1e5, 1e3), unit='$1000')

In [ ]:

# Combine billionaires with the CEO table by matching the billionaires'
# 'name' column against the 'ceo' column of `ceos`, then display every row.
b_donations = billionaires.join('name', ceos, 'ceo')
b_donations.show()

In [ ]:

# Scatter 'amount' against 'net worth', keeping only rows with amount below
# 6e6 so that larger amounts do not stretch the vertical axis.
b_donations.where('amount', are.below(6e6)).scatter('net worth', 'amount')

In [ ]:

# Ratio of each row's 'amount' to its 'net worth', stored as 'Contribution%'.
donation_prop = b_donations.column('amount') / b_donations.column('net worth')
b_donations = (
    b_donations
    .with_column('Contribution%', donation_prop)
    .sort('amount', descending=True)   # largest amounts first
)
# Render the ratio as a percentage, then display the top 20 rows.
b_donations.set_format('Contribution%', PercentFormatter)
b_donations.show(20)

In [ ]:

# Scratch arithmetic: 0.02 percent of 20,000.
# NOTE(review): presumably applies a Contribution% value from the table above
# to a 20,000 base for comparison -- confirm the intended interpretation.
.02 / 100 * 20000

Meteorites¶

In [ ]:

# Load the meteorite data from meteorites_na.csv and display the table.
meteorites = Table.read_table('meteorites_na.csv')
meteorites

In [ ]:

# Number of meteorites in each 'class', most common class first.
class_counts = meteorites.group('class')
class_counts.sort('count', descending=True)

In [ ]:

# Number of meteorites for each value of 'how'.
meteorites.group('how')

In [ ]:

# Group by 'how' and aggregate every other column with np.average.
meteorites.group('how', np.average)

In [ ]:

# Histogram of 'year', split by the 'how' column, with bin edges every
# 5 years from 1800 through 2015.
meteorites.hist('year', bins=np.arange(1800, 2020, 5), group='how')

In [ ]:

# Scatter plot of 'mass' against 'year' for all meteorites.
meteorites.scatter('year', 'mass')

In [ ]:

# Same scatter plot restricted to meteorites with mass below 1e5, so the
# heaviest ones do not dominate the vertical axis.
meteorites.where('mass', are.below(1e5)).scatter('year', 'mass')

In [ ]:

# Meteorites with year above 1950, separated by the value of 'how'.
new_fell = (meteorites.where('how', are.equal_to('Fell'))
                      .where('year', are.above(1950)))
new_found = (meteorites.where('how', are.equal_to('Found'))
                       .where('year', are.above(1950)))
new_found

Maps¶

In [ ]:

# Map one marker per row of `new_fell`, built from its 'lat', 'lon' and
# 'name' columns (in that order).
Marker.map_table(new_fell.select('lat', 'lon', 'name'))

In [ ]:

# Map one circle (radius=10) per row of `new_found`, built from its 'lat',
# 'lon' and 'name' columns (in that order).
Circle.map_table(new_found.select('lat', 'lon', 'name'), radius=10)

Bikes¶

If you loaded the meteorites dataset above, now might be a good time to restart your kernel, run the cell at the top of the notebook, then come back here.

The bike dataset in this section is big, and could lead to kernel crashes.

In [ ]:

# Load the bike trip data from trip.csv and display the table.
trip = Table.read_table('trip.csv')
trip

In [ ]:

# Keep only trips with 'Duration' below 1800 and plot their durations.
# NOTE(review): 1800 is presumably 30 minutes expressed in seconds (a later
# cell plots Duration with unit='second') -- confirm.
commute = trip.where('Duration', are.below(1800))
commute.hist('Duration')

In [ ]:

# Same histogram with 60 bins and the axis unit labelled as seconds.
commute.hist('Duration', bins=60, unit='second')

In [ ]:

# Percent of rides in `commute` with a duration between 250 and 500 seconds,
# estimated as bar area = width (500 - 250 seconds) * height.
# NOTE(review): 0.15 is presumably the approximate bar height (percent per
# second) read off the histogram above -- confirm against the plot.
(500-250) * 0.15 

In [ ]:

# Number of commute trips starting at each station, busiest station first.
starts = commute.group('Start Station').sort('count', descending=True)
starts

In [ ]:

# Count commute trips for every station pair: one column per 'Start Station',
# one row per 'End Station'.
commute.pivot('Start Station', 'End Station')

In [ ]:

# Narrow `trip` (all trips, not just `commute`) to the three columns needed
# for per-route duration analysis.
duration = trip.select('Start Station', 'End Station', 'Duration')
duration

In [ ]:

# Shortest recorded duration for every (start station, end station) pair.
# Grouping with `min` produces a 'Duration min' column, renamed here.
shortest = (
    duration
    .group(['Start Station', 'End Station'], min)
    .relabeled('Duration min', 'Minimum Duration')
)
shortest

In [ ]:

# Routes whose start station name contains 'Civic Center BART', ordered from
# the shortest minimum duration to the longest.
from_cc = (
    shortest
    .where('Start Station', are.containing('Civic Center BART'))
    .sort('Minimum Duration')
)
from_cc

Maps, again¶

In [ ]:

# Load the station data from station.csv and display the table.
stations = Table.read_table('station.csv')
stations

In [ ]:

# Map one marker per station, built from its 'lat', 'long' and 'name'
# columns (note: this table uses 'long', not 'lon').
Marker.map_table(stations.select('lat', 'long', 'name'))

In [ ]:

# Keep only stations whose 'landmark' is 'San Francisco' and draw each one as
# a green circle of radius 15.
sf = stations.where('landmark', 'San Francisco')
Circle.map_table(sf.select('lat', 'long', 'name'), color='green', radius=15)

Extra practice¶

Write a Python expression below each of the following descriptions that computes its value. You may use more than one line.¶

In [ ]:

# The name of the station where the most rentals ended (assume no ties).

In [ ]:

# The number of stations for which the average duration ending at that station was more than 10 minutes.

In [ ]:

# The number of stations that have more than 500 starts AND more than 500 ends

In [ ]:

# The name of the station where the most rentals ended (assume no ties).
# First, count the trips ending at each station.
# Then, sort so the highest end count comes first and take that station's name.
# The trip table's column is labelled 'End Station' (the label used by the
# other cells that read `trip`); grouping on 'End' would not find a column.
(trip.group('End Station')
     .sort('count', descending=True)
     .column('End Station')
     .item(0))

In [ ]:

# The number of stations for which the average duration ending at that station was more than 10 minutes.

# First, keep only the two columns needed, so the average is computed for
# 'Duration' alone and the result can be addressed by label, not position.
# Then, find the average duration of trips ending at each station
# (grouping with np.average names the result column 'Duration average').
# Then, keep the stations above 10 minutes (10 * 60 seconds) and count them.
(trip.select('End Station', 'Duration')
     .group('End Station', np.average)
     .where('Duration average', are.above(10 * 60))
     .num_rows)

In [ ]:

# The number of stations that have more than 500 starts AND more than 500 ends.

# First, group the trips on start station and on end station, separately.
# The column labels are 'Start Station' and 'End Station' (the labels used by
# the other cells that read `trip`), not 'Start' and 'End'.
start_counts = trip.group('Start Station').relabeled('count', 'Start Count')
end_counts = trip.group('End Station').relabeled('count', 'End Count')

# Then, join the two grouped tables on the station name.
# Then, count the rows where both the start and end counts are above 500.
(start_counts.join('Start Station', end_counts, 'End Station')
             .where('Start Count', are.above(500))
             .where('End Count', are.above(500))
             .num_rows)