from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Model proportions handed to sample_proportions: 0.75 for the first category
# (purple-flowering plants, per the statistic below) and 0.25 for the other.
model = make_array(0.75, 0.25)

# One draw of sample proportions for a sample of 929 under the model
# (bare expression: shown as cell output when run in a notebook).
sample_proportions(929, model)

# Statistic: distance between the sample percent (of purple plants) and 75.
abs(100 * sample_proportions(929, model).item(0) - 75)

# Simulation: repeat the statistic many times under the model.
# The results go into a preallocated array; growing an array with np.append
# copies it on every iteration, which is quadratic in the repetition count.
repetitions = 10000
distances = np.empty(repetitions)
for i in np.arange(repetitions):
    new_distance = abs(100 * sample_proportions(929, model).item(0) - 75)
    distances[i] = new_distance

Table().with_column('Distance from 75%', distances).hist()

# 705 of Mendel's 929 plants were purple flowering.
observed_distance = abs(100 * (705 / 929) - 75)
observed_distance

# Redraw the simulated distribution and mark the observed statistic on the
# horizontal axis with a red dot. The trailing semicolon suppresses the
# scatter call's return value in notebook output.
Table().with_column('Distance from 75%', distances).hist()
plots.scatter(observed_distance, 0, color='red', s=30);
# Two distributions over the same five ethnicity categories: the 'Eligible'
# proportions and the 'Panels' proportions.
ethnicity_labels = make_array('Asian', 'Black', 'Latino', 'White', 'Other')
eligible_proportions = make_array(0.15, 0.18, 0.12, 0.54, 0.01)
panel_proportions = make_array(0.26, 0.08, 0.08, 0.54, 0.04)
jury = Table().with_columns(
    'Ethnicity', ethnicity_labels,
    'Eligible', eligible_proportions,
    'Panels', panel_proportions
)
jury

# Horizontal bar chart of both distributions, one group per ethnicity.
jury.barh('Ethnicity')

# Signed gap between the two distributions for each category.
signed_gap = jury.column('Panels') - jury.column('Eligible')
jury_with_diffs = jury.with_column('Difference', signed_gap)
jury_with_diffs

# Magnitude of each gap, ignoring its direction.
absolute_gap = np.abs(jury_with_diffs.column('Difference'))
jury_with_diffs = jury_with_diffs.with_column('Absolute Difference', absolute_gap)
jury_with_diffs

# Total of the absolute gaps, then half of that total (the same quantity
# that total_variation_distance computes for these two columns).
total_absolute_gap = sum(jury_with_diffs.column('Absolute Difference'))
total_absolute_gap
total_absolute_gap / 2
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    Args:
        distribution_1: Array-like of proportions (NumPy array, list, tuple).
        distribution_2: Array-like of proportions over the same categories,
            in the same order as ``distribution_1``.

    Returns:
        Half the sum of absolute element-wise differences. For 1-D inputs
        this is a single number.

    Raises:
        ValueError: If the two inputs cannot be broadcast against each other
            (raised by NumPy during the subtraction).
    """
    # Convert up front so plain lists and tuples work, not just arrays.
    first = np.asarray(distribution_1, dtype=float)
    second = np.asarray(distribution_2, dtype=float)
    # Summing along axis 0 yields a scalar for 1-D input and matches what the
    # built-in sum() produces for stacked (2-D) input.
    return np.abs(first - second).sum(axis=0) / 2
# Distance between the 'Eligible' and 'Panels' columns, via the function.
total_variation_distance(jury.column('Eligible'), jury.column('Panels'))

# Draw one random sample of size 1453 from the eligible distribution and
# place it next to the existing columns for comparison.
eligible = jury.column('Eligible')
sample_distribution = sample_proportions(1453, eligible)
panels_and_sample = jury.with_column('Random Sample', sample_distribution)
panels_and_sample
panels_and_sample.barh('Ethnicity')

# Distance between that random sample and the eligible distribution.
total_variation_distance(panels_and_sample.column('Random Sample'), eligible)

# Observed statistic: distance between the panels and the eligible distribution.
observed_tvd = total_variation_distance(jury.column('Panels'), eligible)
observed_tvd

# One simulated value of the statistic under random sampling from `eligible`.
sample_distribution = sample_proportions(1453, eligible)
total_variation_distance(sample_distribution, eligible)

# Simulation: repeat the draw many times and record each distance.
# The results go into a preallocated array; growing an array with np.append
# copies it on every iteration, which is quadratic in the repetition count.
repetitions = 10000
tvds = np.empty(repetitions)
for i in np.arange(repetitions):
    sample_distribution = sample_proportions(1453, eligible)
    new_tvd = total_variation_distance(sample_distribution, eligible)
    tvds[i] = new_tvd

Table().with_column('Total Variation Distance', tvds).hist(bins = 20)