import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
# Apply matplotlib's 'fivethirtyeight' style sheet to the plots drawn below.
plots.style.use('fivethirtyeight')
# Load the flight table from CSV; everything below reads its 'Delay' column.
united = Table.read_table('united_summer2015.csv')
# Bare expression: only has a visible effect in an interactive/notebook session.
united
# Histogram of the delays with bin edges every 10 units from -20 up to 290.
united.hist('Delay', bins = np.arange(-20, 300, 10))
# Population statistics over every row of the table; the simulations further
# down compare their sample means against these two values.
delays = united.column('Delay')
mean_delay = np.mean(delays)
sd_delay = np.std(delays)
# Display-only expressions (no effect when run as a plain script).
mean_delay, sd_delay
# 50th percentile of the delays, for comparison with the mean above.
percentile(50, delays)
# Empirical distribution of the sample mean for random samples of 400 rows.
sample_size = 400

# Draw 10,000 random samples and record the mean 'Delay' of each one.
# The means are collected in a list and converted to an array once: growing
# an array with np.append copies the whole array on every iteration.
means_400 = np.array([
    np.mean(united.sample(sample_size).column('Delay'))
    for _ in range(10000)
])

Table().with_columns('Sample Mean', means_400).hist(bins=20)
plots.title('Sample Size ' + str(sample_size))
plots.xlabel('Sample Average')
print('Population Average: ', mean_delay)
# Display-only: average of the simulated sample means, to compare with the
# population average printed on the previous line.
np.average(means_400)
# Repeat the simulation with samples of 900 rows, then overlay both
# empirical distributions of the sample mean on one histogram.
sample_size = 900

# Collect the 10,000 means in a list and convert once; np.append inside the
# loop would copy the growing array on every iteration.
means_900 = np.array([
    np.mean(united.sample(sample_size).column('Delay'))
    for _ in range(10000)
])

# One column per sample size; the labels become the histogram legend entries.
means_tbl = Table().with_columns(
    '400', means_400,
    '900', means_900,
)
# Shared bin edges every 0.5 from 5 up to 30.5 so both histograms line up.
means_tbl.hist(bins=np.arange(5, 31, 0.5))
# Trailing semicolon keeps a notebook from echoing the call's return value.
plots.title('Distribution of Sample Average');
#####################################################
"""Empirical distribution of random sample means"""
def sample_means(sample_size, repetitions=10000):
    """Simulate sample means of the 'Delay' column and summarize them.

    Draws `repetitions` random samples of `sample_size` rows from the
    module-level `united` table (via `united.sample`), records the mean
    'Delay' of each sample, draws a histogram of those means, and prints
    the population mean/SD next to the mean/SD of the simulated means.

    Args:
        sample_size: Number of rows in each random sample.
        repetitions: Number of samples to draw. Defaults to 10000, the
            value that used to be hard-coded in the body.

    Returns:
        None. The output is the histogram and the printed summary.
    """
    # The population column does not change between prints; fetch it once.
    population_delays = united.column('Delay')

    # Build the array of means in one step instead of calling np.append in
    # the loop, which copies the whole array on every iteration.
    means = np.array([
        np.mean(united.sample(sample_size).column('Delay'))
        for _ in range(repetitions)
    ])

    # Named `means_table` rather than `sample_means` so the local does not
    # shadow this function's own name.
    means_table = Table().with_column('Sample Means', means)

    # Display empirical histogram and print all relevant quantities
    means_table.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))
    print("Sample size: ", sample_size)
    print("Population mean:", np.mean(population_delays))
    print("Average of sample means: ", np.mean(means))
    print("Population SD:", np.std(population_delays))
    print("SD of sample means:", np.std(means))
# Run the simulation for three sample sizes; their square roots are
# 10, 20 and 25, the divisors used in the expression below.
for size in (100, 400, 625):
    sample_means(size)

# Display-only: the population SD, followed by that SD divided by each of the
# three square roots, for comparison with the "SD of sample means" printed
# by the calls above.
sd_delay, sd_delay / make_array(10, 20, 25)
# Compare the SD of simulated sample means with population SD / sqrt(n)
# for sample sizes 50, 100, ..., 400.
sample_sizes = np.arange(50, 401, 50)

sd_of_sample_means = make_array()
for n in sample_sizes:
    # 10,000 sample means for this sample size, built in one step rather
    # than with np.append inside the loop (which copies the array each time).
    means = np.array([
        np.mean(united.sample(n).column('Delay'))
        for _ in range(10000)
    ])
    # Only eight sample sizes, so appending to this short array is cheap.
    sd_of_sample_means = np.append(sd_of_sample_means, np.std(means))

# One row per sample size: the simulated SD beside the population SD
# divided by the square root of the sample size.
sd_comparison = Table().with_columns(
    'Sample Size n', sample_sizes,
    'SD of 10,000 Sample Means', sd_of_sample_means,
    'Population_SD/sqrt(n)', sd_delay / np.sqrt(sample_sizes),
)
# Display-only expression (shows the table in a notebook).
sd_comparison
# Plot the other two columns against 'Sample Size n'.
sd_comparison.scatter('Sample Size n')