#!/usr/bin/env python
# coding: utf-8

# In[ ]:


from datascience import *
import numpy as np

# `%matplotlib inline` is an IPython magic. The `get_ipython` builtin only
# exists inside an IPython/Jupyter session, so look it up defensively: when
# this exported script is run with plain Python the magic is simply skipped
# instead of aborting the whole script with a NameError.
try:
    ipython_shell = get_ipython()
except NameError:
    ipython_shell = None
if ipython_shell is not None:
    ipython_shell.run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
# Use the 'fivethirtyeight' style sheet for every plot drawn below.
plots.style.use('fivethirtyeight')


# ## Distributions ##

# In[ ]:


# Model a six-sided die as a one-column table whose 'Face' column holds the
# integers 1 through 6.
faces = np.arange(1, 7)
die = Table().with_column('Face', faces)
die


# In[ ]:


# Draw 10 random rows from the die table, i.e. ten simulated rolls.
die.sample(10)


# In[ ]:


# Histogram bin edges 0.5, 1.5, ..., 6.5: seven edges, so that each of the
# six faces sits in the middle of its own unit-wide bin.
roll_bins = np.arange(7) + 0.5


# In[ ]:


# Histogram of the die table itself (one row per face), followed by the
# value 1/6 printed for comparison.
die.hist(bins=roll_bins)
chance_per_face = 1 / 6
print(chance_per_face)


# In[ ]:


# Histogram of the faces seen in one random sample of 10 rolls.
ten_rolls = die.sample(10)
ten_rolls.hist(bins=roll_bins)


# #### Large Random Samples 

# In[ ]:


# Draw a fresh random sample of rolls and plot its histogram, twice for each
# sample size (10, 100 and 1000), so the histograms can be compared as the
# sample size grows.
for num_rolls in (10, 10, 100, 100, 1000, 1000):
    die.sample(num_rolls).hist(bins=roll_bins)


# In[ ]:


# Read the table from 'united_summer2015.csv' (presumably United Airlines
# flights from summer 2015, judging by the file name -- confirm against the
# data) and give it a leading 'Row' column numbering the rows from 0.
united = Table.read_table('united_summer2015.csv')
row_numbers = np.arange(united.num_rows)
united = united.with_column('Row', row_numbers)
united = united.move_to_start('Row')


# In[ ]:


united


# In[ ]:


# Bin edges shared by every 'Delay' histogram: -20, -10, ..., 200.
delay_bins = np.arange(-20, 201, 10)

# Histogram of 'Delay' over the whole table.
united.hist('Delay', bins=delay_bins)

# Histograms of 'Delay' for fresh random samples of rows, two draws at each
# of the sample sizes 10 and 100.
for num_rows_sampled in (10, 10, 100, 100):
    united.sample(num_rows_sampled).hist('Delay', bins=delay_bins)


# ## Simulating a Statistic ##

# In[ ]:


# Median of the 'Delay' column over the whole table.
all_delays = united.column('Delay')
np.median(all_delays)


# In[ ]:


# Proportion of rows whose 'Delay' is at most 2.
rows_at_most_two = united.where('Delay', are.below_or_equal_to(2))
rows_at_most_two.num_rows / united.num_rows


# In[ ]:


# Median 'Delay' in a single random sample of 10 rows.
sampled_delays = united.sample(10).column('Delay')
np.median(sampled_delays)


# In[ ]:


# Simulate the sample median 10,000 times: each repetition draws a random
# sample of 1000 rows from `united` and records the median of its 'Delay'
# column.
# The values are gathered in a list and converted to an array once; growing
# the array with np.append inside the loop would copy every previously
# collected value on each of the 10,000 iterations.
medians = np.array([
    np.median(united.sample(1000).column('Delay'))
    for _ in range(10000)
])


# In[ ]:


# Histogram of the simulated medians, using unit-wide bins with edges
# -0.5, 0.5, ..., 5.5.
Table().with_column('Sample Median', medians).hist(bins=np.arange(-0.5, 5.6, 1))


# ## Swain vs. Alabama ##

# In[ ]:


# Proportions of the two categories in the eligible population. The first
# entry (0.26) is the category counted by the statistic below.
eligible_population = make_array(0.26, 0.74)
eligible_population


# In[ ]:


# Category proportions in one random sample of size 100 drawn from the
# eligible population.
sample_proportions(100, eligible_population)


# In[ ]:


# Statistic: the number of black men in a random sample of 100 men drawn
# from the eligible population (the first sample proportion scaled by 100).
panel_proportions = sample_proportions(100, eligible_population)
100 * panel_proportions.item(0)


# In[ ]:


# Simulation

# Repeat the sampling 10,000 times, recording for each simulated sample of
# 100 the count in the first category (its sample proportion times 100).
# The counts are gathered in a list and converted to an array once; growing
# the array with np.append inside the loop would copy every previously
# collected value on each iteration.
counts = np.array([
    100 * sample_proportions(100, eligible_population).item(0)
    for _ in range(10000)
])


# In[ ]:


counts


# In[ ]:


# Visualization

# Histogram of the simulated counts (unit-wide bins with edges 9.5 to 44.5)
# with the observed count drawn as a red dot on the horizontal axis. Note
# that the observed count, 8, lies to the left of the first bin edge.
observed_count = 8
count_bins = np.arange(9.5, 45, 1)
simulated_counts = Table().with_column('Random Sample Count', counts)
simulated_counts.hist(bins=count_bins)
plots.scatter(observed_count, 0, color='red', s=50);


# ## Mendel and Pea Flowers ##

# In[ ]:


# Proportions of the two categories under the model; the first entry (0.75)
# is the purple-flowering category used by the statistic below.
model = make_array(0.75, 0.25)
model


# In[ ]:


# Category proportions in one random sample of 929 plants drawn under the
# model.
sample_proportions(929, model)


# In[ ]:


# Statistic: the distance between the sample percent of purple plants and 75.
sample_percent_purple = 100 * sample_proportions(929, model).item(0)
abs(sample_percent_purple - 75)


# In[ ]:


# Simulation

# Repeat the sampling 10,000 times, recording for each simulated sample of
# 929 plants the distance between its percent in the first category and 75.
# The distances are gathered in a list and converted to an array once;
# growing the array with np.append inside the loop would copy every
# previously collected value on each iteration.
distances = np.array([
    abs(100 * sample_proportions(929, model).item(0) - 75)
    for _ in range(10000)
])


# In[ ]:


# Histogram of the simulated distances, with default bins.
Table().with_column('Distance from 75%', distances).hist()


# In[ ]:


# Observed value of the statistic: 705 out of 929 (presumably the purple
# plants among all plants observed), expressed as a percent and measured as
# a distance from 75.
observed_purple, observed_total = 705, 929
observed_percent = 100 * (observed_purple / observed_total)
observed_distance = abs(observed_percent - 75)
observed_distance


# In[ ]:


# Histogram of the simulated distances again, this time with the observed
# distance drawn as a red dot on the horizontal axis.
distance_table = Table().with_column('Distance from 75%', distances)
distance_table.hist()
plots.scatter(observed_distance, 0, color='red', s=30);


# In[ ]: