In [ ]:

from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

Lecture 23

Percentiles

In [ ]:

# A small five-element data set used in the percentile questions that follow.
s = [1, 7, 3, 9, 5]
# Display the values in increasing order; np.sort returns a sorted copy and
# leaves s itself unchanged.
np.sort(s)

In [ ]:

# Question: is the 10th percentile of s equal to 0? Displays True or False.
percentile(10, s)==0

In [ ]:

# Question: are the 39th and 40th percentiles of s the same value?
percentile(39, s) == percentile(40, s)

In [ ]:

# Question: are the 40th and 41st percentiles of s the same value?
percentile(40, s) == percentile(41, s)

In [ ]:

# Question: is the 50th percentile of s equal to 5?
percentile(50, s) == 5

In [ ]:

# A six-element array for working through percentile calculations.
x = make_array(43, 20, 51, 7, 28, 34)
# Keep a sorted copy in y and display it; x stays in its original order.
y = np.sort(x)
y

In [ ]:

# 55% of the number of elements in x; compare with percentile(55, x).
.55 * len(x)

In [ ]:

# The 55th percentile of x.
percentile(55, x)

In [ ]:

# 95% of the number of elements in x; compare with percentile(95, x).
.95 * len(x)

In [ ]:

# The 95th percentile of x.
percentile(95, x)

In [ ]:

# 99% of the number of elements in x; compare with percentile(99, x).
.99 * len(x)

In [ ]:

# The 99th percentile of x.
percentile(99, x)

Bootstrap

In [ ]:

# Compensation data for employees of the city of San Francisco in 2015
# The CSV is read from a path relative to the working directory, then the
# resulting table is displayed.
sf = Table.read_table('san_francisco_2015.csv')
sf

In [ ]:

# Show the rows whose 'Job' is 'Mayor'.
sf.where('Job', 'Mayor')

In [ ]:

# Show the first five rows after sorting by total compensation, largest first.
sf.sort('Total Compensation', descending=True).show(5)

In [ ]:

# Show the first five rows after sorting by total compensation, smallest first.
sf.sort('Total Compensation', descending=False).show(5)

In [ ]:

# Keep only the rows with total compensation greater than 10000; sf is
# rebound to the filtered table, so the dropped rows are gone from here on.
sf = sf.where('Total Compensation', are.above(10000))

In [ ]:

# Number of rows currently in sf.
sf.num_rows

In [ ]:

# Histogram bin edges every 25000, starting at 0; np.arange excludes the stop
# value, so the last edge is 675000.
sf_bins = np.arange(0, 700000, 25000)
# Histogram of total compensation over all rows of sf.
sf.hist('Total Compensation', bins=sf_bins)

In [ ]:

# Median (50th percentile) of total compensation over all rows of sf. The
# later cells compare sample-based estimates against this value.
pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

In [ ]:

# Draw 300 rows from sf without replacement.
our_sample = sf.sample(300, with_replacement=False)

In [ ]:

# Histogram of total compensation in the sample, drawn with the same bins
# (sf_bins) as the histogram of sf.
our_sample.hist('Total Compensation', bins=sf_bins)

In [ ]:

# Median of total compensation in the sample.
est_median = percentile(50, our_sample.column('Total Compensation'))
est_median

In [ ]:

#################################
# Resample from our_sample. sample() is called with no arguments, so the
# library's defaults decide how many rows are drawn and whether rows may repeat.
# NOTE(review): in the datascience library the defaults draw as many rows as
# the table has, with replacement -- confirm against the installed version.
resample1 = our_sample.sample()

In [ ]:

# Median of total compensation in the resample.
percentile(50, resample1.column('Total Compensation'))

In [ ]:

def bootstrap_median(original_sample, label, replications):
    """Simulate the sample median by repeatedly resampling a table.

    original_sample: table containing the original sample
    label: label of column containing the variable
    replications: number of bootstrap samples to draw
    Returns array of bootstrap sample medians, one entry per replication
    """
    # Collect the medians in a list and build the array once at the end.
    # Calling np.append inside the loop allocates and copies the whole array
    # on every replication.
    resampled_medians = []
    for _ in np.arange(replications):
        # sample() with no arguments relies on the table's default behaviour.
        # NOTE(review): for a datascience Table that is a draw of num_rows rows
        # with replacement -- confirm against the installed version.
        bootstrap_sample = original_sample.sample()
        resampled_medians.append(percentile(50, bootstrap_sample.column(label)))

    # Appending once to an empty make_array() yields the same result dtype as
    # appending the medians one at a time, including for zero replications.
    return np.append(make_array(), resampled_medians)

In [ ]:

# Array of 2000 bootstrap medians of total compensation, resampled from our_sample.
bstrap_medians = bootstrap_median(our_sample, 'Total Compensation', 2000)

In [ ]:

# Put the bootstrap medians in a one-column table so they can be drawn as a
# histogram.
resampled_medians = Table().with_column(
    'Bootstrap Sample Median', bstrap_medians)

resampled_medians.hist()

# Mark pop_median with a red dot on the horizontal axis (y = 0). The trailing
# semicolon keeps the notebook from echoing the returned plot object.
plots.scatter(pop_median, 0, color='red', s=40);

In [ ]:

# 2.5th percentile of the bootstrap medians: the left end of the interval.
# Together with the 97.5th percentile it bounds the middle 95% of the medians.
left = percentile(2.5, bstrap_medians)
left

In [ ]:

# 97.5th percentile of the bootstrap medians: the right end of the interval.
# Together with the 2.5th percentile it bounds the middle 95% of the medians.
right = percentile(97.5, bstrap_medians)
right

In [ ]:

# Histogram of the bootstrap medians with the interval and pop_median overlaid.
resampled_medians.hist()

# Yellow bar along the horizontal axis from left to right; zorder=1 draws it
# beneath the red dot.
plots.plot([left, right], [0, 0], color='yellow', lw=10, zorder=1)
# Red dot at pop_median, drawn on top of the bar (zorder=2).
plots.scatter(pop_median, 0, color='red', s=50, zorder=2);

In [ ]:

# Both endpoints collected in one array, in the order (left, right).
confidence_interval = make_array(left, right)
confidence_interval

In [ ]:

# THE BIG SIMULATION: This one takes a long time.

# Build 50 intervals. Each one starts from its own fresh sample of 300 rows of
# sf, bootstraps 2000 medians from that sample, and records the 2.5th and
# 97.5th percentiles of those medians as the interval's endpoints.

left_ends = make_array()
right_ends = make_array()

for i in range(50):
    first_sample = sf.sample(300, with_replacement=False)
    medians = bootstrap_median(first_sample, 'Total Compensation', 2000)

    lower_end = percentile(2.5, medians)
    upper_end = percentile(97.5, medians)

    left_ends = np.append(left_ends, lower_end)
    right_ends = np.append(right_ends, upper_end)

In [ ]:

# One row per simulated interval, holding its left and right endpoints.
intervals = Table().with_columns(
    'Left', left_ends,
    'Right', right_ends
)    

In [ ]:

# Display the table of interval endpoints.
intervals

In [ ]:

# Count the intervals that contain pop_median. The comparisons include the
# endpoints: an interval whose left or right end is exactly pop_median still
# contains it, and strict comparisons would wrongly leave such intervals out.
good = (
    intervals
    .where('Left', are.below_or_equal_to(pop_median))
    .where('Right', are.above_or_equal_to(pop_median))
    .num_rows
)

In [ ]:

# Proportion of the simulated intervals that contain pop_median. Dividing by
# the number of rows in intervals (rather than a literal 50) keeps the result
# correct if the number of simulated intervals is changed.
good / intervals.num_rows

In [ ]: