from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
# --- Percentile warm-up on a tiny list ---
# Each bare expression below is meant to be displayed as a notebook cell
# result; run as a plain script, they are evaluated and discarded.
s = [1, 7, 3, 9, 5]
# Sorted view of s, to make the percentile comparisons easy to check by eye.
np.sort(s)
# True/False checks on how `percentile` (from the datascience package)
# behaves on s; they are comparisons to inspect, not assertions.
percentile(10, s)==0
percentile(39, s) == percentile(40, s)
percentile(40, s) == percentile(41, s)
percentile(50, s) == 5
# --- Percentile on a six-element array ---
x = make_array(43, 20, 51, 7, 28, 34)
y = np.sort(x)
y
# For each p: first p% of the element count, then the p-th percentile.
# NOTE(review): presumably the product is shown so the reader can relate it
# to the position that `percentile` picks in the sorted array y -- confirm
# against the datascience `percentile` documentation.
.55 * len(x)
percentile(55, x)
.95 * len(x)
percentile(95, x)
.99 * len(x)
percentile(99, x)
# Compensation data for employees of the city of San Francisco in 2015
sf = Table.read_table('san_francisco_2015.csv')
sf
# Look up the row(s) whose 'Job' column is 'Mayor'.
sf.where('Job', 'Mayor')
# Show the 5 highest and then the 5 lowest 'Total Compensation' rows.
sf.sort('Total Compensation', descending=True).show(5)
sf.sort('Total Compensation', descending=False).show(5)
# Keep only rows with 'Total Compensation' above 10000; `sf` is rebound, so
# every later cell works with this filtered table.
sf = sf.where('Total Compensation', are.above(10000))
sf.num_rows
# Histogram bin edges: 0, 25000, ... up to (but not including) 700000.
# The same bins are reused for the sample histogram so the plots line up.
sf_bins = np.arange(0, 700000, 25000)
sf.hist('Total Compensation', bins=sf_bins)
# Median of the whole (filtered) table: the quantity the rest of the
# notebook tries to estimate.
pop_median = percentile(50, sf.column('Total Compensation'))
pop_median
# Draw 300 rows without replacement and compute the same statistic on them.
# No random seed is set, so the sample differs from run to run.
our_sample = sf.sample(300, with_replacement=False)
our_sample.hist('Total Compensation', bins=sf_bins)
est_median = percentile(50, our_sample.column('Total Compensation'))
est_median
#################################
# One resample of our_sample and its median.
# NOTE(review): `sample()` is called with no arguments; presumably the
# datascience default draws as many rows as the table has, with
# replacement (i.e. one bootstrap resample) -- confirm against the
# Table.sample documentation.
resample1 = our_sample.sample()
percentile(50, resample1.column('Total Compensation'))
def bootstrap_median(original_sample, label, replications):
    """Simulate the sample median by resampling the original sample.

    original_sample: table containing the original sample
    label: label of column containing the variable
    replications: number of bootstrap samples

    Returns array of bootstrap sample medians, one entry per replication
    (an empty array when replications is 0).
    """
    # Collect the medians in a list and convert once at the end: calling
    # np.append inside the loop copies the whole array on every iteration.
    medians = [
        # One resample of the original table, reduced to its 50th percentile.
        percentile(50, original_sample.sample().column(label))
        for _ in np.arange(replications)
    ]
    # dtype=float keeps the result a float array, matching what appending
    # onto an empty array produced for numeric columns.
    return np.array(medians, dtype=float)
# Run 2000 bootstrap replications on our_sample and collect the medians.
bstrap_medians = bootstrap_median(our_sample, 'Total Compensation', 2000)
# Wrap the array in a one-column table so it can be drawn with .hist().
resampled_medians = Table().with_column(
'Bootstrap Sample Median', bstrap_medians)
resampled_medians.hist()
# Mark the population median as a red dot on the horizontal axis (y = 0).
# The trailing semicolon presumably suppresses the notebook's echo of the
# returned matplotlib object.
plots.scatter(pop_median, 0, color='red', s=40);
# Endpoints of the middle 95% of the bootstrap medians:
# the 2.5th and 97.5th percentiles.
left = percentile(2.5, bstrap_medians)
left
right = percentile(97.5, bstrap_medians)
right
# Redraw the histogram with the [left, right] interval as a yellow bar.
# zorder=2 on the scatter draws the red dot on top of the bar (zorder=1).
resampled_medians.hist()
plots.plot([left, right], [0, 0], color='yellow', lw=10, zorder=1)
plots.scatter(pop_median, 0, color='red', s=50, zorder=2);
confidence_interval = make_array(left, right)
confidence_interval
# THE BIG SIMULATION: This one takes a long time.
# Generate the endpoints of num_intervals intervals. Each interval comes from
# a fresh sample of 300 rows of sf (drawn without replacement) that is then
# bootstrapped 2000 times.
num_intervals = 50  # single source of truth for the loop and the proportion
left_ends = make_array()
right_ends = make_array()
for i in np.arange(num_intervals):
    first_sample = sf.sample(300, with_replacement=False)
    medians = bootstrap_median(first_sample, 'Total Compensation', 2000)
    # Middle 95% of the bootstrap medians: 2.5th and 97.5th percentiles.
    left_ends = np.append(left_ends, percentile(2.5, medians))
    right_ends = np.append(right_ends, percentile(97.5, medians))
intervals = Table().with_columns(
    'Left', left_ends,
    'Right', right_ends
)
intervals
# An interval is "good" when Left <= pop_median <= Right. The bounds are
# inclusive because the interval is closed: an endpoint that lands exactly on
# the population median still contains it.
good = (
    intervals
    .where('Left', are.below_or_equal_to(pop_median))
    .where('Right', are.above_or_equal_to(pop_median))
    .num_rows
)
# Proportion of good intervals; the denominator is taken from the table so it
# cannot drift away from the number of intervals actually generated.
good / intervals.num_rows