# Notebook setup: plotting, numerics, and the `datascience` table library.
import matplotlib
# Star import: the code below uses Table, are and make_array unqualified,
# presumably all provided by this package.
from datascience import *
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
# Apply the 'fivethirtyeight' matplotlib style sheet to every figure.
plots.style.use('fivethirtyeight')
import warnings
# Silence FutureWarning messages so they do not clutter the output.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Load the births data and take a first look at its columns.
births = Table.read_table('baby.csv')
births.labels
# Histogram of every column except the one dropped by index 5, one plot each.
births.drop(5).hist(overlay=False)

# Mean and SD of the maternal pregnancy weights.
mpw = births.column('Maternal Pregnancy Weight')
mean, sd = np.mean(mpw), np.std(mpw)
mean, sd

# Proportion of rows whose weight lies within 3 SDs of the mean ...
lower_bound, upper_bound = mean - 3*sd, mean + 3*sd
within_3_SDs = births.where(
    'Maternal Pregnancy Weight',
    are.between(lower_bound, upper_bound),
)
within_3_SDs.num_rows/births.num_rows
# ... compared with Chebyshev's lower bound for 3 SDs, 1 - 1/3^2.
1 - 1/3**2
# See if Chebyshev's bounds work for different distributions.
# For every column of `births`, print the percent of rows whose value lies
# within z SDs of that column's mean, for z = 2, 3, 4 and 5.
# NOTE: this loop rebinds the module-level names `mean` and `sd` (and
# `chosen`); after it finishes they describe the LAST column only.
for k in births.labels:
    values = births.column(k)
    mean = np.mean(values)
    sd = np.std(values)
    print()
    print(k)
    for z in np.arange(2, 6):
        # Keep only the rows whose value falls in the band mean +/- z SDs.
        chosen = births.where(k, are.between(mean - z*sd, mean + z*sd))
        proportion = chosen.num_rows/births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')
def standard_units(x):
    """Convert array x to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of ``x``. The SD used is
    ``np.std`` with its default settings.

    Args:
        x: Array of numbers.

    Returns:
        Array of the same length as ``x`` whose mean is 0 and whose SD is 1,
        up to floating-point rounding. If the SD of ``x`` is 0 the division
        is by zero, so the result is not meaningful.
    """
    return (x - np.mean(x))/np.std(x)
# Convert the maternal ages to standard units and compare them with the
# original values.
ages = births.column('Maternal Age')
ages_standard_units = standard_units(ages)
ages_standard_units
# Because standard_units subtracts the mean and divides by the SD, these
# should come out as (approximately) 0 and 1.
np.mean(ages_standard_units), np.std(ages_standard_units)
# Two-column table pairing each age with its value in standard units.
both = Table().with_columns(
'Age in Years', ages,
'Age in Standard Units', ages_standard_units
)
both
#####################
# Mean and SD of the ages in their original units, for comparison.
np.mean(ages), np.std(ages)
# Histogram of the original ages, using bins of width 2 starting at 15.
both.hist('Age in Years', bins = np.arange(15, 46, 2))
# Histogram of the same ages in standard units, using bins of width 0.35.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
# Restrict the x-axis of the current plot; the trailing semicolon presumably
# suppresses the echoed return value in the notebook.
plots.xlim(-2, 3.1);
# Maternal heights: histogram with unit-width bins, then mean and SD.
births.hist('Maternal Height', bins=np.arange(56.5, 72.6, 1))
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

# Birth weights: histogram with default bins, then mean and SD.
births.hist('Birth Weight')
bw = births.column('Birth Weight')
mean_w, sd_w = np.mean(bw), np.std(bw)
mean_w, sd_w
# Roulette wheel with 38 pockets.
# A bet on red pays 1 to 1: win $1 on 18 of the pockets, lose $1 on the
# other 20.
red_winnings = np.concatenate([np.ones(18), -np.ones(20)])
red = Table().with_columns('Winnings on Red', red_winnings)
red.show()
# One bar per outcome: unit-width bins centred on -1, 0 and +1.
red.hist(bins=np.arange(-1.5, 1.6, 1))
# Chance of making $1 from a single bet placed on red.
18/38
# Simulate the net gain from num_bets bets on red, many times over.
num_bets = 100  # spins of the roulette wheel per simulation, each with a bet on red
num_simulations = 20000  # how many times the num_bets bets are repeated
# Preallocate the results: growing the array with np.append copies it on
# every iteration, which is quadratic in the number of simulations.
net_gains = np.empty(num_simulations)  # net amount won in each simulation
for i in np.arange(num_simulations):
    # Draw num_bets rows at random from the 38 possible outcomes.
    spins = red.sample(num_bets)
    new_net_gain = np.sum(spins.column('Winnings on Red'))
    net_gains[i] = new_net_gain
# Bell-shaped histogram
# (even though the original distribution was far from bell-shaped)
Table().with_columns('Net Gain', net_gains).hist()
plots.xticks(np.arange(-45, 36, 10));
# Centre and spread of the simulated net gains.
np.average(net_gains)
np.std(net_gains)
# Load the flight delay data and look at the distribution of delays.
united = Table.read_table('united_summer2015.csv')
united
delay_bins = np.arange(-20, 300, 10)  # bins of width 10, shared by both delay histograms
united.hist('Delay', bins=delay_bins)

# Mean and SD of the delays.
delays = united.column('Delay')
mean_delay, sd_delay = np.mean(delays), np.std(delays)
mean_delay, sd_delay

# Add the delays in standard units and show the longest delays first.
united = united.with_columns('Delay in Standard Units', standard_units(delays))
united.sort('Delay', descending=True)

# Chebyshev: at least 89% of the delays must be within 3 SDs of the mean.
chosen = united.where('Delay in Standard Units', are.between(-3, 3))
chosen.num_rows/united.num_rows
united.hist('Delay', bins=delay_bins)
# Take a random sample of size sample_size from the population of flights.
# Repeat to get the empirical distribution of the sample average.
sample_size = 400
num_samples = 10000  # number of random samples drawn
# Preallocate the results: growing the array with np.append copies it on
# every iteration, which is quadratic in the number of samples.
means = np.empty(num_samples)
for i in np.arange(num_samples):
    sampled_flights = united.sample(sample_size)
    sample_mean = np.mean(sampled_flights.column('Delay'))
    means[i] = sample_mean
Table().with_columns('Sample Mean', means).hist(bins = 20)
plots.title('Sample Means: Sample Size ' + str(sample_size))
plots.xlabel('Random Sample Mean');
# Centre and spread of the simulated sample means ...
np.mean(means), np.std(means)
# ... next to the mean and SD of all the delays ...
mean_delay, sd_delay
# ... and the SD of all the delays divided by sqrt(sample size), to compare
# with the SD of the sample means above.
sd_delay/np.sqrt(sample_size)