from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# United Airlines domestic flights departing from San Francisco
# in the summer of 2015.
united = Table.read_table('united_summer2015.csv')
# Prepend a 'Row' column numbering the flights 0, 1, 2, ... so each row
# can be referred to by its position in the table.
row_numbers = np.arange(united.num_rows)
united = united.with_column('Row', row_numbers).move_to_start('Row')
united
# Deterministic sample: always the same three rows (999, 1000 and 1001).
united.take(np.arange(999, 1002))
# Systematic sample: choose a random starting row among the first 1000,
# then keep every 1000th row from there to the end of the table.
candidate_starts = np.arange(1000)
start = np.random.choice(candidate_starts)
every_thousandth = np.arange(start, united.num_rows, 1000)
systematic_sample = united.take(every_thousandth)
systematic_sample.show()
# The possible outcomes of one roll of a six-sided die.
faces = np.arange(1, 7)
die = Table().with_column('Face', faces)
die
# Table.sample(k) draws k rows at random *with* replacement; pass
# with_replacement=False to draw without replacement instead.
die.sample(10)
# Histogram bins of width 1 whose midpoints are the integers 1 through 6
# (bin edges at 0.5, 1.5, ..., 6.5).
roll_bins = np.arange(0.5, 7)
roll_bins
# Theoretical distribution: every face has the same chance.
die.hist(bins=roll_bins)
# Three empirical distributions, each from its own sample of size 10.
# With so few rolls the histograms change quite a bit between samples.
for trial in range(3):
    die.sample(10).hist(bins=roll_bins)
# Two empirical distributions from samples of size 100.
# These appear more stable than those from samples of size 10.
for trial in range(2):
    die.sample(100).hist(bins=roll_bins)
# Two empirical distributions from samples of size 1000.
# More stable again than sizes 10 or 100, and closer to the theoretical
# distribution (equal probabilities).
for trial in range(2):
    die.sample(1000).hist(bins=roll_bins)
# Back to the United data: show the first five rows as a reminder of
# the column labels.
united.show(5)
# Histogram of the 'Delay' column using bins of width 10 from -20 to 200.
united.hist('Delay', bins=np.arange(-20, 201, 10))
# Those bins do not cover every value; check the actual range of delays.
delays = united.column('Delay')
min(delays), max(delays)
# Look up the row(s) with the 580-minute delay.
united.where('Delay', are.equal_to(580))
# Proportion of flights with a delay above 200. The original analysis
# reports this as about 0.8%; for now we focus on where the bulk of the
# data are and set those large values aside.
num_over_200 = united.where('Delay', are.above(200)).num_rows
num_over_200 / united.num_rows
# `united` holds all the domestic flights departing from San Francisco
# during the summer of 2015, so we treat it as a population and look at
# the empirical distributions of random samples drawn from it.
delay_bins = np.arange(-20, 201, 10)
# Two empirical distributions, each from its own sample of size 10.
for trial in range(2):
    united.sample(10).hist('Delay', bins=delay_bins)
# Two empirical distributions, each from its own sample of size 100.
for trial in range(2):
    united.sample(100).hist('Delay', bins=delay_bins)
# Treating `united` as the population, compute the population median
# delay once and keep it so later steps use the computed value rather
# than a hand-copied number.
population_median = np.median(united.column('Delay'))
population_median
# Proportion (a fraction between 0 and 1, not a percentage) of flights
# whose delay is less than or equal to the population median.
at_or_below_median = united.where('Delay', are.below_or_equal_to(population_median))
at_or_below_median.num_rows / united.num_rows
# Estimate the median from a single random sample of size 10; the value
# changes each time this line is run.
np.median(united.sample(10).column('Delay'))
# Simulate the empirical distribution of the sample median (a statistic).
# We draw 10000 random samples of size 1000 from `united` and record the
# median delay of each one, giving 10000 estimates of the median.
# This cell takes a few seconds to run.
num_repetitions = 10000
sample_size = 1000
# Preallocate the result array instead of growing it with np.append,
# which would copy the whole array on every iteration.
medians = np.empty(num_repetitions)
for i in np.arange(num_repetitions):
    new_median = np.median(united.sample(sample_size).column('Delay'))
    medians[i] = new_median
# Display the empirical distribution of the sample median as a histogram,
# using unit-width bins whose midpoints are the integers 0 through 5.
Table().with_column('Sample Median', medians).hist(bins=np.arange(-0.5, 5.6, 1))