In [ ]:

from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Lecture 19¶

Birth weights¶

In [ ]:

# Load the births data from baby.csv and display it.
# Units: birth weight is in ounces; maternal pregnancy weight is in pounds.
baby = Table.read_table('baby.csv')
baby

In [ ]:

# Keep only the two columns this analysis compares: the birth weight and
# whether the mother smoked.
smoking_and_birthweight = baby.select('Birth Weight', 'Maternal Smoker')
smoking_and_birthweight

In [ ]:

# Number of rows in each 'Maternal Smoker' category (group with no
# aggregation function counts the rows per group).
smoking_and_birthweight.group('Maternal Smoker')

In [ ]:

# Distribution of birth weights over all rows, ignoring smoking status.
smoking_and_birthweight.hist('Birth Weight')

In [ ]:

# Birth-weight distribution drawn separately for each 'Maternal Smoker'
# category, so the two groups can be compared visually.
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

In [ ]:

# Average birth weight within each 'Maternal Smoker' category.
observed_means = smoking_and_birthweight.group('Maternal Smoker', np.average)
observed_means

In [ ]:

# Observed test statistic: the average in the first row of observed_means
# minus the average in the second row.
# NOTE(review): presumably this is non-smokers minus smokers (False sorting
# before True) -- confirm against the observed_means table.
group_averages = observed_means.column(1)
observed_diff = group_averages.item(0) - group_averages.item(1)
observed_diff

In [ ]:

def diff_between_group_means(tbl, group_label='Maternal Smoker'):
    """Return the first group's average minus the second group's average.

    The rows of ``tbl`` are grouped by ``group_label`` and averaged with
    ``np.average``; the result is the value in row 0 of the averaged column
    minus the value in row 1, in the row order produced by ``tbl.group``.

    Parameters
    ----------
    tbl : Table
        Table containing the ``group_label`` column and a numeric column to
        average. The averaged values are read from column index 1 of the
        grouped table, so the numeric column is expected to be the first
        non-grouping column.
    group_label : str, optional
        Label of the column that defines the two groups. Defaults to
        'Maternal Smoker', which is what existing one-argument callers use.

    Returns
    -------
    float
        Average of the first group minus average of the second group.
    """
    group_averages = tbl.group(group_label, np.average).column(1)
    return group_averages.item(0) - group_averages.item(1)

In [ ]:

# PLAN:

# Shuffle birth weights

# Assign some to group A and some to group B

# Find difference between averages of the two groups (statistic)

# Repeat

In [ ]:

# One-column table holding just the birth weights; this is the column that
# gets shuffled in the permutation test.
weights = smoking_and_birthweight.select('Birth Weight')
weights

In [ ]:

# One-column table holding just the group labels; these stay in their
# original order while the weights are shuffled.
smoking = smoking_and_birthweight.select('Maternal Smoker')
smoking

In [ ]:

# Shuffle birth weights
shuffled_weights = weights.sample(with_replacement=False).column(0)
shuffled_weights

In [ ]:

# Assign some to group A and some to group B
shuffled = smoking.with_column('Shuffled weights', shuffled_weights)
shuffled

In [ ]:

# Find difference between averages of the two groups (statistic)
diff = diff_between_group_means(shuffled)
diff

In [ ]:

# Repeat
diffs = make_array()
for i in np.arange(2000):
    shuffled_weights = weights.sample(with_replacement=False).column(0)
    shuffled = smoking.with_column('Shuffled weights', shuffled_weights)
    means = shuffled.group('Maternal Smoker', np.average)
    diff = means.column(1).item(0) - means.column(1).item(1)
    diffs = np.append(diffs, diff)

diffs

In [ ]:

# Empirical distribution of the simulated differences between group means.
Table().with_column('Difference between group means', diffs).hist()

In [ ]:

# Show the observed statistic again so it can be compared with the
# simulated differences.
observed_diff

Deflategate¶

In [ ]:

# Pressure is measured in pounds per square inch (psi).
# Two officials (Blakeman and Prioleau) measured the pressure
# of the balls at half-time.
# Most of the Colts' balls were not measured.
football = Table.read_table('deflategate.csv')
football.show()

In [ ]:

#Take average of two measurements
football = football.select('Team').with_column(
    'Combined', (football.column('Blakeman')+football.column('Prioleau'))/2
    )
football.show()

In [ ]:

# np.ones(n) is an array of n ones; multiplying it by a constant gives an
# array filled with that constant.
np.ones(5)

In [ ]:

#Before the start of AFC game, ball pressures were measured
#NFL rule:  ball pressure between 12.5 and 13.5 psi
#Patriots' balls were all about 12.5 psi
#Colts' balls were about 13.0 psi
start = np.append(12.5 * np.ones(11), 13 * np.ones(4))
start

In [ ]:

# Some deflation is normal during game; investigate the pressure drop
drop_values = start - football.column('Combined')
drop_values

In [ ]:

# Replace the 'Combined' column with the pressure drop.
# NOTE: this cell rebinds `football`; running it a second time fails because
# the 'Combined' column has already been dropped.
football = football.drop('Combined').with_column('Drop', drop_values)
football.show()

In [ ]:

# Average pressure drop for each team.
means_tbl = football.group('Team', np.average)
means_tbl

In [ ]:

# Observed test statistic: the second row's average drop minus the first
# row's, in the row order of means_tbl.
# NOTE(review): presumably the rows are Colts then Patriots, making this
# Patriots minus Colts -- confirm against the means_tbl table.
drop_avgs = means_tbl.column('Drop average')
observed_difference = drop_avgs.item(1) - drop_avgs.item(0)
observed_difference
# Large values imply a larger drop for the Patriots.

In [ ]:

# Do the 11 Patriot balls look like a random sample of the 15 balls?
# Could this difference be due to chance?

In [ ]:

# Split the table into its team labels and its drop values so the drops can
# be shuffled while the labels stay in place.
group_labels = football.select('Team')
drops = football.select('Drop')
group_labels

In [ ]:

# Simulate the difference between the two teams' average drops under random
# assignment of drops to teams. The result array is allocated once and
# filled in place; calling np.append inside a 20000-iteration loop would
# copy the whole array on every iteration.
num_simulations = 20000
differences = np.zeros(num_simulations)

for i in np.arange(num_simulations):
    # Shuffle the drops and pair them with the original team labels.
    shuffled_drops = drops.sample(with_replacement=False).column('Drop')
    shuffled_tbl = group_labels.with_column('Shuffled Drop', shuffled_drops)
    # Loop-specific names are used so the observed `means_tbl` and
    # `drop_avgs` from the earlier cells are not overwritten.
    shuffled_means = shuffled_tbl.group('Team', np.average)
    shuffled_avgs = shuffled_means.column('Shuffled Drop average')
    # Same orientation as observed_difference: second row minus first row.
    differences[i] = shuffled_avgs.item(1) - shuffled_avgs.item(0)

In [ ]:

# Empirical distribution of the simulated differences, with the observed
# difference marked as a red dot on the horizontal axis.
Table().with_column('Difference Between Means', differences).hist()
plots.scatter(observed_difference, 0, color='red', s=40);

In [ ]:

# Empirical P-value: the proportion of simulated differences that are at
# least as large as the observed difference. The divisor is the number of
# simulated values actually present, so it stays correct if the number of
# simulations is changed.
np.count_nonzero(differences >= observed_difference) / len(differences)

In [ ]: