import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
# Use matplotlib's 'fivethirtyeight' style sheet for every plot drawn below.
plots.style.use('fivethirtyeight')
# Four sample values used to show several equivalent ways of computing a mean.
# NOTE(review): the bare expressions below look like notebook cell outputs;
# run as a plain script they are evaluated and discarded.
values = make_array(2, 3, 3, 9)
# Mean as total divided by count.
sum(values)/len(values)
# The same mean via NumPy.
np.average(values)
np.mean(values)
# The same arithmetic written out by hand.
(2 + 3 + 3 + 9)/4
# The same sum regrouped: each distinct value weighted by the proportion of
# entries equal to it (2 once, 3 twice, 9 once, out of 4).
2*(1/4) + 3*(2/4) + 9*(1/4)
# One-column table holding the values, so they can be drawn as a histogram.
values_table = Table().with_columns('value', values)
values_table
# Unit-width bin edges 0.5, 1.5, ..., 10.5, so each integer from 1 to 10 sits
# at the centre of its own bin. Reused by a later histogram.
bins_for_display = np.arange(0.5, 10.6, 1)
values_table.hist('value', bins = bins_for_display)
# Make an array of 10 2s, 20 3s, and 10 9s: the same three distinct values as
# `values`, in the same proportions (1/4, 2/4, 1/4), with ten times as many
# entries. np.repeat states the counts directly instead of relying on a
# hand-typed 40-element literal that is easy to miscount.
new_vals = np.repeat([2, 3, 9], [10, 20, 10])
# Histogram of the 40-entry array, drawn with the same bins as the histogram
# of `values` so the two pictures can be compared directly.
Table().with_column('value', new_vals).hist(bins = bins_for_display)
# Averages of the two arrays side by side: both contain 2s, 3s and 9s in the
# same proportions, so the two averages agree.
np.average(values)
np.average(new_vals)
# Build the standard deviation of `values` one step at a time, adding a column
# to sd_table for each intermediate quantity.
sd_table = Table().with_columns('Value', values)
sd_table
# Mean of the table's first (and only) column.
average_value = np.average(sd_table.column(0))
average_value
# Deviation of each value from the mean.
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table
# Positive and negative deviations from the mean cancel, so this total is zero
# (up to floating-point rounding).
sum(deviations)
# Squaring makes every deviation non-negative so they no longer cancel.
sd_table = sd_table.with_columns('Squared Deviation', deviations ** 2)
sd_table
# Variance of the data: the mean of the squared deviations
variance = np.mean(sd_table.column('Squared Deviation'))
variance
# Standard Deviation (SD) is the square root of the variance
sd = variance ** 0.5
sd
# NumPy's built-in SD, for comparison with the manual computation above.
np.std(values)
# Load the births data set from 'baby.csv' and preview its first 3 rows.
births = Table.read_table('baby.csv')
births.show(3)
# overlay = False: presumably one histogram per column rather than all columns
# on shared axes -- confirm against the datascience Table.hist documentation.
births.hist(overlay = False)
# Mean and SD of maternal pregnancy weight.
# NOTE: this rebinds `sd`, which previously held the SD of `values`.
mpw = births.column('Maternal Pregnancy Weight')
mean = np.mean(mpw)
sd = np.std(mpw)
mean, sd
# Rows whose pregnancy weight lies within 3 SDs of the mean.
# NOTE(review): are.between is understood to include the lower bound and
# exclude the upper bound -- confirm against the datascience predicate docs.
within_3_SDs = births.where('Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))
# Proportion within 3 SDs of the mean
within_3_SDs.num_rows / births.num_rows
# Chebyshev's bound:
# The proportion we calculated above should be at least
# 1 - 1/z**2 with z = 3, i.e. 8/9 (about 0.889).
1 - 1/(3**2)
# Column labels of the births table; the loop below iterates over them.
births.labels
# See if Chebyshev's bounds work for distributions with various shapes.
# For every column of `births`, print the percentage of rows whose value lies
# within z SDs of that column's mean, for z = 2, 3, 4 and 5.
# NOTE: the loop rebinds the module-level names `values`, `mean` and `sd`;
# after it finishes they describe the last column of the table.
for feature in births.labels:
    values = births.column(feature)
    mean = np.mean(values)
    sd = np.std(values)
    # Blank line, then the column name as a heading for its four results.
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        chosen = births.where(feature, are.between(mean - z*sd, mean + z*sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '% of the data')
def standard_units(x):
    """Convert array x to standard units.

    Each value is expressed as its distance from the mean of x, measured in
    multiples of the SD of x (``np.std`` with its default settings). The
    result therefore has mean 0 and SD 1.

    Args:
        x: Numeric array or array-like (e.g. a NumPy array or a plain list).

    Returns:
        The values of x in standard units, in the same order as the input.
        If every value of x is equal, the SD is 0 and the division follows
        NumPy semantics (nan entries for array input).
    """
    # np.subtract rather than `x - ...` so plain sequences such as lists are
    # accepted too; for NumPy arrays the two spellings are identical.
    return np.subtract(x, np.mean(x)) / np.std(x)
# Convert maternal ages to standard units and check the result.
ages = births.column('Maternal Age')
ages_standard_units = standard_units(ages)
# After conversion the mean should be 0 and the SD 1 (up to rounding).
np.mean(ages_standard_units), np.std(ages_standard_units)
# Original and converted ages side by side, one row per birth.
both = Table().with_columns(
'Age in Years', ages,
'Age in Standard Units', ages_standard_units
)
both
# Mean and SD of the ages in their original units (years), for reference.
np.mean(ages), np.std(ages)
# Ages in years, in 2-year bins with edges 15, 17, ..., 45.
both.hist('Age in Years', bins = np.arange(15, 46, 2))
# The same ages in standard units, in bins of width 0.35 starting at -2.2.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
# Restrict the x-axis to [-2, 3.1].
# NOTE(review): the trailing semicolon presumably suppresses the notebook's
# echo of the return value -- it has no effect on the plot itself.
plots.xlim(-2, 3.1);
# Maternal heights in unit-width bins with edges 56.5, 57.5, ..., 72.5.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)
# One SD above and one SD below the mean height, in that order.
np.mean(heights) + np.std(heights), np.mean(heights) - np.std(heights)