In [4]:
'''
-------------------------------------------------------------------------------
Filename   : ch2.ipynb
Date       : 2012-04-30
Author     : C. Vogel
Purpose    : Replicate analysis of height and weight data in Chapter 2 of 
           : _Machine Learning for Hackers_.
Input Data : 01_heights_weights_genders.csv is available at the book's github 
           : repository at https://github.com/johnmyleswhite/ML_for_Hackers.git
Libraries  : NumPy 1.6.1, matplotlib 1.1.0, pandas 0.7.3, SciPy 0.10.0, 
           : statsmodels 0.4.0
-------------------------------------------------------------------------------

This notebook is a Python port of the R code in Chapter 2 of _Machine Learning
for Hackers_ by D. Conway and J.M. White.

Running the notebook will produce 10 PNG figures and save them to the working 
directory.

The height/weight dataset CSV file should be located in a /data/ subfolder of 
the working directory. 

For a detailed description of the analysis and the process of porting it
to Python, see: slendrmeans.wordpress.com/will-it-python.
'''
In [7]:
import numpy as np
from pandas import read_table, Series, DataFrame
import matplotlib.pyplot as plt
import os
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDE
from statsmodels.nonparametric import lowess
from statsmodels.api import GLM, Logit
In [8]:
# Numeric Summaries
# p. 37
In [9]:
# Import the height and weights data
heights_weights = read_table('data/01_heights_weights_genders.csv', sep = ',', header = 0)
In [10]:
# Assign the heights column to its own series, and describe it.
heights = heights_weights['Height']
heights.describe()
Out[10]:
count    10000.000000
mean        66.367560
std          3.847528
min         54.263133
25%         63.505620
50%         66.318070
75%         69.174262
max         78.998742
In [11]:
# Means, medians, and modes (p. 38)
In [12]:
def my_mean(x):
    return float(np.sum(x)) / len(x)
In [13]:
def my_median(x):
    '''
    Compute the median of a series x.
    '''

    # Get a sorted copy of the values in the series (need to call values
    # otherwise integer indexing messes things up).
    sorted_x = np.sort(x.values)
    n = len(x)
    if n % 2 == 0:
        # Even n: average the two middle values.
        indices = [n // 2 - 1, n // 2]
        return np.mean(sorted_x[indices])
    else:
        # Odd n: take the middle value, at index (n - 1) / 2; the -1
        # accounts for 0-based indexing.
        return sorted_x[(n - 1) // 2]
In [14]:
# Check my_mean and my_median against built-ins
In [15]:
my_mean(heights) - heights.mean()
Out[15]:
0.0
In [16]:
my_median(heights) - heights.median()
Out[16]:
0.0
In [17]:
# Quantiles (p. 40)
In [18]:
heights.min(), heights.max()
Out[18]:
(54.263133325097101, 78.998742346389605)
In [19]:
# Range = max - min. Note: np.ptp(heights.values) will do the same thing.
# HT Nathaniel Smith
In [20]:
def my_range(s):
    '''
    Difference between the max and min of an array or Series
    '''
    return s.max() - s.min()

my_range(heights)
Out[20]:
24.735609021292504
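In [ ]:
# Quick check (not in the original notebook): np.ptp ("peak to peak")
# computes the same max - min range mentioned above.
np.ptp(heights.values) - my_range(heights)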
In [21]:
# Similarly, pandas doesn't seem to provide multiple quantiles in one call.
# But (1) the standard ones are available via .describe(), and
# (2) writing a function for arbitrary quantiles is simple.
In [22]:
# To get a single quantile
heights.quantile(.5)
Out[22]:
66.31807008178464
In [23]:
# Function to get arbitrary quantiles of a series.
def my_quantiles(s, prob = (0.0, 0.25, 0.5, 1.0)):
    '''
    Calculate quantiles of a series.

    Parameters:
    -----------
    s : a pandas Series 
    prob : a tuple (or other iterable) of probabilities at 
           which to compute quantiles. Must be an iterable,
           even for a single probability (e.g. prob = (0.50,)
           not prob = 0.50).

    Returns:
    --------
    A pandas series with the probabilities as an index.
    '''
    q = [s.quantile(p) for p in prob]
    return Series(q, index = prob)

# With the default prob argument   
my_quantiles(heights)
Out[23]:
0.00    54.263133
0.25    63.505620
0.50    66.318070
1.00    78.998742
In [24]:
# With a specific prob argument - here deciles
my_quantiles(heights, prob = np.arange(0, 1.1, 0.1))
Out[24]:
0.0    54.263133
0.1    61.412701
0.2    62.859007
0.3    64.072407
0.4    65.194221
0.5    66.318070
0.6    67.435374
0.7    68.558072
0.8    69.811620
0.9    71.472149
1.0    78.998742
In [25]:
# Standard deviation and variance
In [26]:
def my_var(x):
    # Sample variance: divide by (n - 1) (Bessel's correction), to match
    # the default behavior of pandas' .var().
    return np.sum((x - x.mean())**2) / (len(x) - 1)
In [27]:
my_var(heights) - heights.var()
Out[27]:
-2.2854607095723622e-11
In [28]:
def my_sd(x):
    return np.sqrt(my_var(x))
In [29]:
my_sd(heights) - heights.std()
Out[29]:
-2.9700686354772188e-12
In [30]:
# Exploratory Data Visualization (p. 44)
In [31]:
# Histograms
In [32]:
# 1-inch bins
bins1 = np.arange(heights.min(), heights.max(), 1.)
heights.hist(bins = bins1, fc = 'steelblue')
plt.savefig('height_hist_bins1.png')
In [33]:
# 5-inch bins
bins5 = np.arange(heights.min(), heights.max(), 5.)
heights.hist(bins = bins5, fc = 'steelblue')
plt.savefig('height_hist_bins5.png')
In [122]:
# 0.001-inch bins
bins001 = np.arange(heights.min(), heights.max(), .001)
heights.hist(bins = bins001, fc = 'steelblue')
plt.savefig('height_hist_bins001.png')
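In [ ]:
# Aside (a sketch, not in the book): instead of picking 1-inch, 5-inch, or
# 0.001-inch bins by hand, a data-driven bin width such as the
# Freedman-Diaconis rule, 2 * IQR / n**(1/3), splits the difference.
iqr = heights.quantile(.75) - heights.quantile(.25)
fd_width = 2 * iqr / len(heights) ** (1. / 3)
fd_bins = np.arange(heights.min(), heights.max() + fd_width, fd_width)
heights.hist(bins = fd_bins, fc = 'steelblue')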
In [28]:
# Kernel density estimators, from statsmodels.nonparametric.kde.
In [34]:
# Create a KDE object
heights_kde = KDE(heights)
# Use fit() to estimate the densities. Default is gaussian kernel 
# using fft. This will provide a "density" attribute.
heights_kde.fit()
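In [ ]:
# Cross-check (a sketch, not in the original notebook): scipy's
# gaussian_kde should give a similar density estimate, though its default
# bandwidth rule differs from statsmodels', so the curves won't match
# exactly.
from scipy.stats import gaussian_kde
heights_gkde = gaussian_kde(heights.values)
heights_gkde(np.linspace(heights.min(), heights.max(), 5))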
In [35]:
# Plot the density of the heights
# Sort inside the plotting so the lines connect nicely.
fig = plt.figure()
plt.plot(heights_kde.support, heights_kde.density)
plt.savefig('heights_density.png')
In [36]:
# Pull out male and female heights as arrays over which to compute densities

heights_m = heights[heights_weights['Gender'] == 'Male'].values
heights_f = heights[heights_weights['Gender'] == 'Female'].values
heights_m_kde = KDE(heights_m)
heights_f_kde = KDE(heights_f)
heights_m_kde.fit()
heights_f_kde.fit()

fig = plt.figure()
plt.plot(heights_m_kde.support, heights_m_kde.density, label = 'Male')
plt.plot(heights_f_kde.support, heights_f_kde.density, label = 'Female')
plt.legend()
plt.savefig('height_density_bysex.png')                
In [37]:
# Do the same thing with weights.
weights_m = heights_weights[heights_weights['Gender'] == 'Male']['Weight'].values
weights_f = heights_weights[heights_weights['Gender'] == 'Female']['Weight'].values
weights_m_kde = KDE(weights_m)
weights_f_kde = KDE(weights_f)
weights_m_kde.fit()
weights_f_kde.fit()


fig = plt.figure()
plt.plot(weights_m_kde.support, weights_m_kde.density, label = 'Male')
plt.plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
plt.legend()
plt.savefig('weight_density_bysex.png')
In [38]:
# Subplot weight density by sex.
fig, axes = plt.subplots(nrows = 2, ncols = 1, sharex = True, figsize = (9, 6))
plt.subplots_adjust(hspace = 0.1)
axes[0].plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
axes[0].xaxis.tick_top()
axes[0].legend()
axes[1].plot(weights_m_kde.support, weights_m_kde.density, label = 'Male')
axes[1].legend()
plt.savefig('weight_density_bysex_subplot.png')
In [39]:
# Scatter plot. Pull weight (both sexes) out as a separate array first, like 
# we did with height above.
In [40]:
weights = heights_weights['Weight']
plt.plot(heights, weights, '.k', mew = 0, alpha = .1)
plt.savefig('height_weight_scatter.png')
In [41]:
# Lowess smoothing - this seems to be new functionality not yet in the docs (as of 0.4.0, April 2012).
In [46]:
lowess_line = lowess.lowess(weights, heights)
In [48]:
plt.figure(figsize = (13, 9))
plt.plot(heights, weights, '.', mfc = 'steelblue', mew=0, alpha = .25)
plt.plot(lowess_line[:,0], lowess_line[:, 1],  '-', color = '#461B7E', label = "Lowess fit")
plt.legend(loc = "upper left")
plt.savefig('height_weight_lowess.png')
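In [ ]:
# Aside (a sketch, not in the original analysis): the smoothness of the
# lowess fit is set by the fraction of the data used in each local
# regression. Assuming the signature lowess(endog, exog, frac = ...) with
# frac = 2/3 as the default, a smaller fraction hugs the data more closely.
lowess_rough = lowess.lowess(weights, heights, frac = 1. / 5)
plt.plot(lowess_rough[:, 0], lowess_rough[:, 1], '-', label = 'frac = 1/5')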
In [49]:
# Logistic regression of sex on height and weight
# Sex is coded in the binary variable `male`.
In [50]:
# LHS binary variable
male = (heights_weights['Gender'] == 'Male') * 1
In [51]:
# Matrix of predictor variables: height and weight from the data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values
In [52]:
# Logit model 1: Using GLM and the Binomial family w/ the logit link.
# Note I have to add a constant column to the `exog` matrix. The prepend = True
# argument prevents a warning about a future change to the default argument.
logit_model = GLM(male, sm.add_constant(hw_exog, prepend = True),
                  family = sm.families.Binomial(sm.families.links.logit))
logit_model.fit().summary()
Out[52]:
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:                 Gender   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                Binomial   Df Model:                            2
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -2091.3
Date:                Tue, 08 May 2012   Deviance:                       4182.6
Time:                        14:24:49   Pearson chi2:                 9.72e+03
No. Iterations:                     8
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.6925      1.328      0.521      0.602        -1.911     3.296
x1            -0.4926      0.029    -17.013      0.000        -0.549    -0.436
x2             0.1983      0.005     38.663      0.000         0.188     0.208
==============================================================================
In [53]:
# Get the coefficient parameters.
logit_pars = logit_model.fit().params
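In [ ]:
# Aside (not in the original notebook): logit coefficients exponentiate
# to odds ratios. E.g., exp(0.1983) ~ 1.22, so each extra pound of weight
# multiplies the odds of being male by about 1.22, holding height fixed.
np.exp(logit_pars)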
In [54]:
# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend = True))
logit_model2.fit().summary()
Optimization terminated successfully.
         Current function value: 2091.297971
         Iterations 8
Out[54]:
                           Logit Regression Results
==============================================================================
Dep. Variable:                 Gender   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Tue, 08 May 2012   Pseudo R-squ.:                  0.6983
Time:                        14:24:51   Log-Likelihood:                -2091.3
converged:                       True   LL-Null:                       -6931.5
                                        LLR p-value:                     0.000
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          0.6925      1.328      0.521      0.602        -1.911     3.296
x1            -0.4926      0.029    -17.013      0.000        -0.549    -0.436
x2             0.1983      0.005     38.663      0.000         0.188     0.208
==============================================================================
In [55]:
# Get the coefficient parameters
logit_pars2 = logit_model2.fit().params
Optimization terminated successfully.
         Current function value: 2091.297971
         Iterations 8
In [56]:
# Compare the two methods again. They give the same parameters.
DataFrame({'GLM' : logit_pars, 'Logit' : logit_pars2})
Out[56]:
             GLM     Logit
const   0.692543  0.692543
x1     -0.492620 -0.492620
x2      0.198340  0.198340
In [57]:
# Draw a separating line in the [height, weight]-space.
# The line will separate the space into predicted-male
# and predicted-female regions.
In [58]:
# Get the intercept and slope of the line from the logit coefficients.
# The boundary is where the log-odds are zero:
#   const + b1 * height + b2 * weight = 0,
# i.e., weight = -const / b2 + (-b1 / b2) * height.
intercept = -logit_pars['const'] / logit_pars['x2']
slope = -logit_pars['x1'] / logit_pars['x2']
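In [ ]:
# Sanity check (a sketch, not in the original analysis): any point on the
# separating line should get a predicted probability of 0.5, since the
# line is exactly where the log-odds are zero.
h = 66.0
w = intercept + slope * h
log_odds = logit_pars['const'] + logit_pars['x1'] * h + logit_pars['x2'] * w
1.0 / (1.0 + np.exp(-log_odds))   # should be 0.5 up to rounding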
In [59]:
# Plot the data and the separating line
# Color code male and female points.
fig = plt.figure(figsize = (10, 8))
plt.plot(heights_f, weights_f, '.', label = 'Female', mew = 0, mfc='coral', alpha = .1)
plt.plot(heights_m, weights_m, '.', label = 'Male', mew = 0, mfc='steelblue', alpha = .1)
plt.plot(np.array([50, 80]), intercept + slope * np.array([50, 80]), '-', color = '#461B7E')
plt.legend(loc='upper left')
plt.savefig('height_weight_class.png')