from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Load the baby data set.
# Birth weight is in ounces; maternal pregnancy weight is in pounds.
baby = Table.read_table('baby.csv')
baby

# Keep only the two columns needed to compare the 'Maternal Smoker' groups.
smoking_and_birthweight = baby.select('Birth Weight', 'Maternal Smoker')
smoking_and_birthweight

# Tabulate the 'Maternal Smoker' groups.
smoking_and_birthweight.group('Maternal Smoker')

# Birth-weight histogram: all rows together, then split by 'Maternal Smoker'.
smoking_and_birthweight.hist('Birth Weight')
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

# Average birth weight within each 'Maternal Smoker' group.
observed_means = smoking_and_birthweight.group('Maternal Smoker', np.average)
observed_means

# Observed statistic: the first group's average minus the second group's
# average, taking the groups in the row order of the grouped table.
group_averages = observed_means.column(1)
observed_diff = group_averages.item(0) - group_averages.item(1)
observed_diff
def diff_between_group_means(tbl, group_label='Maternal Smoker'):
    """Return the first group's average minus the second group's average.

    The table is grouped by ``group_label`` with ``np.average`` as the
    collector, and the statistic is read from column index 1 of the grouped
    table (row 0 minus row 1).

    Parameters:
        tbl: a table exposing ``group(label, collect)``; the grouped result
            must expose ``column(index)`` returning an array with at least
            two entries.
        group_label: label of the column that defines the groups. Defaults
            to 'Maternal Smoker', the only label the original supported.

    Returns:
        The difference between the averages in rows 0 and 1 of the grouped
        table. Any further groups are ignored.
    """
    means = tbl.group(group_label, np.average)
    averages = means.column(1)
    return averages.item(0) - averages.item(1)
# PLAN:
#   1. Shuffle the birth weights.
#   2. Attach the shuffled weights to the original group labels, which
#      assigns some weights to group A and some to group B.
#   3. Find the difference between the averages of the two groups
#      (the statistic).
#   4. Repeat.
weights = smoking_and_birthweight.select('Birth Weight')
weights
smoking = smoking_and_birthweight.select('Maternal Smoker')
smoking

# Step 1: shuffle the birth weights by sampling without replacement.
shuffled_weights = weights.sample(with_replacement=False).column(0)
shuffled_weights

# Step 2: pair the shuffled weights with the unshuffled group labels.
shuffled = smoking.with_column('Shuffled weights', shuffled_weights)
shuffled

# Step 3: compute the statistic for this one shuffle.
diff = diff_between_group_means(shuffled)
diff

# Step 4: repeat the shuffle many times and collect every simulated statistic.
# The result array is allocated once up front; growing it with np.append
# would copy the whole array on each iteration.
num_repetitions = 2000
diffs = np.empty(num_repetitions)
for i in np.arange(num_repetitions):
    shuffled_weights = weights.sample(with_replacement=False).column(0)
    shuffled = smoking.with_column('Shuffled weights', shuffled_weights)
    # Reuse the helper so the simulated and single-shuffle statistics are
    # computed by the same code.
    diffs[i] = diff_between_group_means(shuffled)
diffs

# Distribution of the simulated statistic, followed by the observed value
# for comparison.
Table().with_column('Difference between group means', diffs).hist()
observed_diff
# Pressure is measured in pounds per square inch (psi).
# Two officials (Blakeman and Prioleau) measured the pressure of the balls
# at half-time.
# Most of the Colts' balls were not measured.
football = Table.read_table('deflategate.csv')
football.show()

# Take the average of the two officials' measurements for each ball.
football = football.select('Team').with_column(
    'Combined', (football.column('Blakeman') + football.column('Prioleau')) / 2
)
football.show()

# Quick look at what np.ones produces before using it below.
np.ones(5)

# Before the start of the AFC game, the ball pressures were measured.
# NFL rule: ball pressure must be between 12.5 and 13.5 psi.
# The Patriots' balls were all about 12.5 psi.
# The Colts' balls were about 13.0 psi.
# NOTE(review): this assumes the table holds 11 Patriots rows followed by
# 4 Colts rows, in that order -- confirm against deflategate.csv.
start = np.append(12.5 * np.ones(11), 13 * np.ones(4))
start

# Some deflation is normal during a game; investigate the pressure drop.
drop_values = start - football.column('Combined')
drop_values
football = football.drop('Combined').with_column('Drop', drop_values)
football.show()

# Average drop per team.
means_tbl = football.group('Team', np.average)
means_tbl
drop_avgs = means_tbl.column('Drop average')
# Large values imply a larger drop for the Patriots.
# NOTE(review): presumably row 0 is the Colts and row 1 the Patriots in the
# grouped table -- confirm the row order.
observed_difference = drop_avgs.item(1) - drop_avgs.item(0)
observed_difference

# Do the 11 Patriots balls look like a random sample of the 15 balls?
# Could this difference be due to chance?
group_labels = football.select('Team')
drops = football.select('Drop')
group_labels

# Permutation test: shuffle the drops across the fixed team labels and
# recompute the statistic each time.
# One constant drives both the loop and the p-value denominator, and the
# result array is allocated once instead of being regrown with np.append.
num_simulations = 20000
differences = np.empty(num_simulations)
for i in np.arange(num_simulations):
    shuffled_drops = drops.sample(with_replacement=False).column('Drop')
    shuffled_tbl = group_labels.with_column('Shuffled Drop', shuffled_drops)
    # Loop-local names keep the observed means_tbl / drop_avgs intact.
    shuffled_means_tbl = shuffled_tbl.group('Team', np.average)
    shuffled_drop_avgs = shuffled_means_tbl.column('Shuffled Drop average')
    new_diff = shuffled_drop_avgs.item(1) - shuffled_drop_avgs.item(0)
    differences[i] = new_diff

# Simulated distribution with the observed statistic marked in red.
Table().with_column('Difference Between Means', differences).hist()
plots.scatter(observed_difference, 0, color='red', s=40);

# Empirical p-value: the fraction of simulated differences at least as large
# as the observed one.
np.count_nonzero(differences >= observed_difference) / num_simulations