In [ ]:
from datascience import *
import numpy as np

%matplotlib inline
#pyplot is imported under the alias `plots`; the 'fivethirtyeight' style
# sheet is selected here, before any figure is drawn
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 15

Random Sampling

In [ ]:
#Load the United Airlines domestic flights that departed from
# San Francisco in the summer of 2015
united = Table.read_table('united_summer2015.csv')
#Number the flights 0, 1, 2, ... in a 'Row' column, then make it the first column
united = united.with_column('Row', np.arange(united.num_rows))
united = united.move_to_start('Row')
In [ ]:
#Display the table to check that the 'Row' column was added at the front
united
In [ ]:
#Deterministic sample: the same three rows (indices 999, 1000, 1001) every time
united.take(np.arange(999, 1002))
In [ ]:
#Systematic sample: pick a random starting row among rows 0-999,
# then keep every 1000th row from there to the end of the table
start = np.random.choice(np.arange(1000))
systematic_sample = united.take(np.arange(united.num_rows)[start::1000])
systematic_sample.show()

Distributions

In [ ]:
#The possible outcomes of a six-sided die, one row per face
die = Table().with_column('Face', make_array(1, 2, 3, 4, 5, 6))
die
In [ ]:
#.sample(k) draws k rows at random from the table. Rows are drawn with
# replacement unless with_replacement=False is passed; the default is
# spelled out here to make that explicit.
die.sample(10, with_replacement=True)
In [ ]:
#Bin edges 0.5, 1.5, ..., 6.5: each bin is one unit wide and centered on a face value
roll_bins = np.arange(1, 8) - 0.5
roll_bins
In [ ]:
#Theoretical (probability) distribution of a single roll: the die table
# lists each face exactly once, so every outcome gets an equal share
die.hist(bins = roll_bins)
In [ ]:
#Empirical distribution: histogram of 10 rolls, simulated by drawing
# 10 rows of die at random
die.sample(10).hist(bins = roll_bins)

Large Random Samples

In [ ]:
#A second empirical distribution, from a fresh random sample of size 10
die.sample(10).hist(bins = roll_bins)
In [ ]:
#A third sample of size 10 (with so few rolls, the distributions change quite a bit)
die.sample(10).hist(bins = roll_bins)
In [ ]:
#Increase the sample size to 100 rolls
die.sample(100).hist(bins = roll_bins)
In [ ]:
#Another empirical distribution with a sample size of 100
#These appear more stable from run to run than with a sample size of 10
die.sample(100).hist(bins = roll_bins)
In [ ]:
#Increase the sample size again, to 1000 rolls
die.sample(1000).hist(bins = roll_bins)
In [ ]:
#Another empirical distribution with a sample size of 1000
#More stable than those with a sample size of 10 or 100,
# and closer to the theoretical distribution (equal probabilities)
die.sample(1000).hist(bins = roll_bins)
In [ ]:
#Return to the united flight data
#Show 5 rows as a reminder of the column labels
united.show(5)
In [ ]:
#Plot a histogram of the delays, using bins of width 10 with edges from -20 to 200
united.hist('Delay', bins = np.arange(-20, 201, 10))
In [ ]:
#The bins we specified stop at 200, so check the actual range of the delays
#NumPy's reductions operate on the whole array at once, whereas the
# built-in min/max loop over it one element at a time
np.min(united.column('Delay')), np.max(united.column('Delay'))
In [ ]:
#Look up the row(s) whose delay is exactly 580 minutes
united.where('Delay', are.equal_to(580))
In [ ]:
#Proportion of flights with a delay above 200 (matching rows / total rows)
#The original note reports this as about 0.8% of the data; those large values
# are set aside for now so we can focus on where the bulk of the data are
united.where('Delay', are.above(200)).num_rows/united.num_rows
In [ ]:
#united includes all the domestic flights that departed from San Francisco
# during the summer of 2015, so think of united as a population
#Draw a random sample of 10 flights from it and view the empirical
# distribution of their delays, using the same bins as for the full table
united.sample(10).hist('Delay', bins = np.arange(-20, 201, 10))
In [ ]:
#Another empirical distribution of delays, from a fresh sample of size 10
united.sample(10).hist('Delay', bins = np.arange(-20, 201, 10))
In [ ]:
#Empirical distribution of delays with the sample size increased to 100
united.sample(100).hist('Delay', bins = np.arange(-20, 201, 10))
In [ ]:
#Another empirical distribution of delays, from a fresh sample of size 100
united.sample(100).hist('Delay', bins = np.arange(-20, 201, 10))

Simulating Statistics

In [ ]:
#Treating united as the population, the median of its 'Delay' column
# is the population median delay (a parameter)
np.median(united.column('Delay'))
In [ ]:
#Proportion of flights whose delay is less than or equal to the population median
#The median is computed from the data instead of being typed in as a literal,
# so this cell stays correct if the data (and hence the median) changes
population_median = np.median(united.column('Delay'))
united.where('Delay', are.below_or_equal_to(population_median)).num_rows / united.num_rows
In [ ]:
#Estimate the median from a random sample of only 10 flights
#The sample is drawn at random, so the estimate can differ on every run
np.median(united.sample(10).column('Delay'))
In [ ]:
#Simulate the empirical distribution of the sample median (a statistic):
# draw num_simulations random samples of sample_size flights each and record
# the median delay of every sample, giving one estimate per sample
#This cell takes a few seconds to run
num_simulations = 10000
sample_size = 1000

#Pre-allocate the results and fill them in place; calling np.append inside
# the loop would copy the whole array on every iteration
medians = np.zeros(num_simulations)
for i in np.arange(num_simulations):
    medians[i] = np.median(united.sample(sample_size).column('Delay'))
In [ ]:
#Display the empirical distribution of the sample median as a histogram,
# using unit-wide bins centered on the integers 0 through 5
#NOTE(review): each sample has an even number of rows (1000), so np.median
# averages the two middle values; if delays are whole numbers this can give a
# half-integer that sits exactly on a bin edge - confirm this binning is intended
Table().with_column('Sample Median', medians).hist(bins = np.arange(-0.5, 5.6, 1))
In [ ]: