#!/usr/bin/env python
# coding: utf-8

# Notebook export: empirical distributions of random samples, simulating a
# statistic (sample median), and two model-assessment examples
# (Swain vs. Alabama jury panel, Mendel's pea flowers).

# In[ ]:

from datascience import *
import numpy as np

# Notebook-only magic; get_ipython() is provided by IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')


# ## Distributions ##

# In[ ]:

# One-column table holding the faces 1 through 6.
die = Table().with_column('Face', np.arange(1, 7))
die


# In[ ]:

die.sample(10)


# In[ ]:

# Bin edges 0.5, 1.5, ..., 6.5 so that each face sits in the middle of a bin.
roll_bins = np.arange(0.5, 6.6, 1)


# In[ ]:

die.hist(bins=roll_bins)
print(1/6)


# In[ ]:

die.sample(10).hist(bins=roll_bins)


# #### Large Random Samples

# The same cell is deliberately repeated at each sample size so the
# histograms of separate random samples can be compared.

# In[ ]:

die.sample(10).hist(bins=roll_bins)


# In[ ]:

die.sample(10).hist(bins=roll_bins)


# In[ ]:

die.sample(100).hist(bins=roll_bins)


# In[ ]:

die.sample(100).hist(bins=roll_bins)


# In[ ]:

die.sample(1000).hist(bins=roll_bins)


# In[ ]:

die.sample(1000).hist(bins=roll_bins)


# In[ ]:

united = Table.read_table('united_summer2015.csv')
# Prepend a 'Row' column numbering the rows from 0.
united = united.with_column('Row', np.arange(united.num_rows)).move_to_start('Row')


# In[ ]:

united


# In[ ]:

# Bin edges -20, -10, ..., 200, shared by every 'Delay' histogram below.
delay_bins = np.arange(-20, 201, 10)
united.hist('Delay', bins=delay_bins)


# In[ ]:

united.sample(10).hist('Delay', bins=delay_bins)


# In[ ]:

united.sample(10).hist('Delay', bins=delay_bins)


# In[ ]:

united.sample(100).hist('Delay', bins=delay_bins)


# In[ ]:

united.sample(100).hist('Delay', bins=delay_bins)


# ## Simulating a Statistic ##

# In[ ]:

np.median(united.column('Delay'))


# In[ ]:

# Proportion of rows whose 'Delay' is at most 2.
united.where('Delay', are.below_or_equal_to(2)).num_rows / united.num_rows


# In[ ]:

np.median(united.sample(10).column('Delay'))


# In[ ]:

# Median 'Delay' of 10,000 random samples of 1,000 rows each.
# The values are collected in a list and converted to an array once:
# np.append returns a new copy of the array on every call, which makes
# growing an array inside the loop quadratic in the number of repetitions.
repetitions = 10000
medians = np.array([
    np.median(united.sample(1000).column('Delay'))
    for _ in range(repetitions)
])


# In[ ]:

Table().with_column('Sample Median', medians).hist(bins=np.arange(-0.5, 5.6, 1))


# ## Swain vs. Alabama ##

# In[ ]:

# Model proportions for the eligible population; the first entry (0.26) is
# the one the statistic below is computed from.
eligible_population = make_array(0.26, 0.74)
eligible_population


# In[ ]:

sample_proportions(100, eligible_population)


# In[ ]:

# statistic: number of black men among random sample
# of 100 men from eligible population
100 * sample_proportions(100, eligible_population).item(0)


# In[ ]:

# Simulation: the statistic above, recomputed for 10,000 random samples.
counts = np.array([
    100 * sample_proportions(100, eligible_population).item(0)
    for _ in range(repetitions)
])


# In[ ]:

counts


# In[ ]:

# Visualization: simulated counts, with the observed count marked in red.
Table().with_column('Random Sample Count', counts).hist(bins=np.arange(9.5, 45, 1))
observed_count = 8
plots.scatter(observed_count, 0, color='red', s=50);


# ## Mendel and Pea Flowers ##

# In[ ]:

# Model proportions; the first entry (0.75) is the purple-flowering share.
model = make_array(0.75, 0.25)
model


# In[ ]:

sample_proportions(929, model)


# In[ ]:

# statistic: distance between sample percent (of purple plants) and 75
abs(100 * sample_proportions(929, model).item(0) - 75)


# In[ ]:

# Simulation: the distance statistic for 10,000 samples of 929 plants.
distances = np.array([
    abs(100 * sample_proportions(929, model).item(0) - 75)
    for _ in range(repetitions)
])


# In[ ]:

Table().with_column('Distance from 75%', distances).hist()


# In[ ]:

# Observed statistic: 705 purple-flowering plants out of 929.
observed_distance = abs(100*(705/929) - 75)
observed_distance


# In[ ]:

# Simulated distances again, with the observed distance marked in red.
Table().with_column('Distance from 75%', distances).hist()
plots.scatter(observed_distance, 0, color='red', s=30);


# In[ ]: