In [ ]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 17

Mendel and Pea Flowers

In [ ]:
# Model under test: a plant is purple-flowering with chance 0.75 (first
# entry) and falls in the other category with chance 0.25.
model = make_array(0.75, 0.25)
In [ ]:
# One simulated sample of 929 plants drawn under the model. The result is
# used below as per-category proportions (entry 0 = purple) — confirm
# against the datascience documentation.
sample_proportions(929, model)
In [ ]:
# Test statistic: how far the simulated percent of purple-flowering plants
# lands from the 75 percent that the model predicts.

simulated_purple_percent = 100 * sample_proportions(929, model).item(0)
abs(simulated_purple_percent - 75)
In [ ]:
# Simulation: build the empirical distribution of the test statistic under
# the model by repeatedly sampling 929 plants and recording how far the
# simulated purple percent lands from 75.

repetitions = 10000

# Preallocate the result array. np.append returns a fresh copy of the whole
# array on every call, so growing an array inside the loop does quadratic
# work in the number of repetitions.
distances = np.zeros(repetitions)

for i in np.arange(repetitions):
    new_distance = abs(100 * sample_proportions(929, model).item(0) - 75)
    distances[i] = new_distance
In [ ]:
# Histogram of the simulated statistic across all repetitions.
Table().with_column('Distance from 75%', distances).hist()
In [ ]:
# Mendel's data: 705 of his 929 plants were purple flowering.
# Apply the same statistic used in the simulation to the observed sample.

observed_purple_percent = 100 * (705 / 929)
observed_distance = abs(observed_purple_percent - 75)
observed_distance
In [ ]:
# Redraw the simulated distribution and mark the observed statistic as a
# red dot on the horizontal axis (y = 0) to show where it falls.
Table().with_column('Distance from 75%', distances).hist()
plots.scatter(observed_distance, 0, color='red', s=30);

Alameda County Jury Panels

In [ ]:
# One row per ethnicity. 'Eligible' and 'Panels' are two distributions over
# the same categories (each column sums to 1) — presumably the share of each
# group among eligible jurors and among people on jury panels; confirm
# against the data source.
jury = Table().with_columns(
    'Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'),
    'Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01),
    'Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04)
)

jury
In [ ]:
# Horizontal bar chart with one group of bars per ethnicity, for comparing
# the 'Eligible' and 'Panels' distributions side by side.
jury.barh('Ethnicity')
In [ ]:
#####
In [ ]:
# Signed gap per ethnicity: panel proportion minus eligible proportion.
panels_minus_eligible = jury.column('Panels') - jury.column('Eligible')
jury_with_diffs = jury.with_column('Difference', panels_minus_eligible)
In [ ]:
jury_with_diffs
In [ ]:
# Drop the sign so over- and under-representation do not cancel when summed.
absolute_differences = np.abs(jury_with_diffs.column('Difference'))
jury_with_diffs = jury_with_diffs.with_column(
    'Absolute Difference', absolute_differences
)
In [ ]:
jury_with_diffs
In [ ]:
# Total absolute discrepancy between the two distributions, over all categories.
sum(jury_with_diffs.column('Absolute Difference'))
In [ ]:
# Halve the total: both columns sum to 1, so every surplus in one category
# is matched by an equal deficit elsewhere and the raw sum counts each
# discrepancy twice.
sum(jury_with_diffs.column('Absolute Difference')) / 2
In [ ]:
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    Parameters
    ----------
    distribution_1, distribution_2 : array-like of numbers
        Proportions for the same categories, listed in the same order.
        NumPy arrays and plain sequences (lists, tuples) are both accepted.

    Returns
    -------
    float
        Half the summed absolute difference; 0 when the inputs are identical.
    """
    # Coerce to arrays so list/tuple inputs subtract elementwise instead of
    # raising TypeError.
    first = np.asarray(distribution_1)
    second = np.asarray(distribution_2)
    # Reduce in NumPy rather than with the builtin sum, which iterates the
    # array one element at a time; axis=0 matches the builtin's behavior.
    return np.abs(first - second).sum(axis=0) / 2
In [ ]:
# Same quantity as the manual half-sum computed from the table, now via the function.
total_variation_distance(jury.column('Eligible'), jury.column('Panels'))
In [ ]:
# The 'Eligible' distribution serves as the model for the random draws below.
eligible = jury.column('Eligible')
In [ ]:
# Draw one random sample from the eligible distribution and attach its
# proportions as a third column for comparison.
# NOTE(review): 1453 is presumably the number of people on the panels — confirm.
sample_distribution = sample_proportions(1453, eligible)
panels_and_sample = jury.with_column('Random Sample', sample_distribution)
In [ ]:
panels_and_sample
In [ ]:
# Bar chart comparing the eligible, panel, and randomly sampled distributions.
panels_and_sample.barh('Ethnicity')
In [ ]:
# Distance between the random sample and the distribution it was drawn from.
total_variation_distance(panels_and_sample.column('Random Sample'), eligible)
In [ ]:
# Observed statistic: distance between the actual panels and the eligible
# distribution, to be compared with the simulated distances.
observed_tvd = total_variation_distance(jury.column('Panels'), eligible)
observed_tvd
In [ ]:
# One simulated value of the statistic: sample from the eligible
# distribution, then measure that sample's distance from it.
sample_distribution = sample_proportions(1453, eligible)
total_variation_distance(sample_distribution, eligible)
In [ ]:
# Simulation: empirical distribution of the total variation distance
# between a random sample drawn from the eligible distribution and that
# distribution itself.

repetitions = 10000

# Preallocate the result array. np.append returns a fresh copy of the whole
# array on every call, so growing an array inside the loop does quadratic
# work in the number of repetitions.
tvds = np.zeros(repetitions)

for i in np.arange(repetitions):
    sample_distribution = sample_proportions(1453, eligible)
    new_tvd = total_variation_distance(sample_distribution, eligible)
    tvds[i] = new_tvd
    
In [ ]:
# Histogram of the simulated total variation distances, using 20 bins.
Table().with_column('Total Variation Distance', tvds).hist(bins = 20)
In [ ]: