%pylab inline

import numpy as np
from pandas import *
import matplotlib.pyplot as plt
import os
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDE
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.formula.api import logit, glm

heights_weights = read_table('data/01_heights_weights_genders.csv', 
                             sep = ',', header = 0)
print heights_weights.head()
print heights_weights['Gender'].value_counts()

heights = heights_weights['Height']
heights.describe()

def my_mean(x):
    return float(np.sum(x)) / len(x)

def my_median(x):
    '''
    Compute the median of a series x.
    '''
    
    # Get a sorted copy of the values in the series (need to call values
    # otherwise integer indexing messes things up.)
    sorted_x = np.sort(x.values)
    if len(x) % 2 == 0:
        indices = [0.5 * len(x) - 1, 0.5 * len(x)]
        return np.mean(sorted_x[indices])
    else:
        # Ceil(x) - 1 = Floor(x), but this is to make clear that the -1 is to
        # account for 0-based counting.
        index = ceil(0.5 * len(x)) - 1
        return sorted_x.ix[index]

my_mean(heights) - heights.mean()

my_median(heights) - heights.median()

print 'Range of heights (in.): ', heights.min(), heights.max()

# Range = max - min. Note: np.ptp(heights.values) will do the same thing.
# HT Nathaniel Smith

def my_range(s):
    '''
    Difference between the max and min of an array or Series
    '''
    return s.max() - s.min()

print 'Range width of heights (in.) (my_range()): ', my_range(heights)
print 'Range width of heights (in.) (np.ptp()): ',np.ptp(heights)

heights.quantile(.5)

def my_quantiles(s, prob = (0.0, 0.25, 0.5, 1.0)):
    '''
    Calculate quantiles of a series.

    Parameters:
    -----------
    s : a pandas Series 
    prob : a tuple (or other iterable) of probabilities at 
           which to compute quantiles. Must be an iterable,
           even for a single probability (e.g. prob = (0.50)
           not prob = 0.50).

    Returns:
    --------
    A pandas series with the probabilities as an index.
    '''
    q = [s.quantile(p) for p in prob]
    return Series(q, index = prob)

# With the default prob argument   
my_quantiles(heights)

my_quantiles(heights, prob = arange(0, 1.1, 0.1))

def my_var(x):
    return np.sum((x - x.mean())**2) / (len(x) - 1)

my_var(heights) - heights.var()

def my_sd(x):
    return np.sqrt(my_var(x))

my_sd(heights) - heights.std()

# 1-inch bins
bins1 = np.arange(heights.min(), heights.max(), 1.)
heights.hist(bins = bins1, fc = 'steelblue')

# 5-inch bins
bins5 = np.arange(heights.min(), heights.max(), 5.)
heights.hist(bins = bins5, fc = 'steelblue')

# 0.001-inch bins
bins001 = np.arange(heights.min(), heights.max(), .001)
heights.hist(bins = bins001, fc = 'steelblue')

heights_kde = KDE(heights)
heights_kde.fit()

fig = plt.figure()
plt.plot(heights_kde.support, heights_kde.density)

heights_m = heights[heights_weights['Gender'] == 'Male'].values
heights_f = heights[heights_weights['Gender'] == 'Female'].values
heights_m_kde = KDE(heights_m)
heights_f_kde = KDE(heights_f)
heights_m_kde.fit()
heights_f_kde.fit()

fig = plt.figure()
plt.plot(heights_m_kde.support, heights_m_kde.density, label = 'Male')
plt.plot(heights_f_kde.support, heights_f_kde.density, label = 'Female')
plt.legend()

weights_m = heights_weights[heights_weights['Gender'] == 'Male']['Weight'].values
weights_f = heights_weights[heights_weights['Gender'] == 'Female']['Weight'].values
weights_m_kde = KDE(weights_m)
weights_f_kde = KDE(weights_f)
weights_m_kde.fit()
weights_f_kde.fit()


fig = plt.figure()
plt.plot(weights_m_kde.support, weights_f_kde.density, label = 'Male')
plt.plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
plt.legend()

fig, axes = plt.subplots(nrows = 2, ncols = 1, sharex = True, figsize = (9, 6))
plt.subplots_adjust(hspace = 0.1)
axes[0].plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
axes[0].xaxis.tick_top()
axes[0].legend()
axes[1].plot(weights_m_kde.support, weights_f_kde.density, label = 'Male')
axes[1].legend()
plt.xlabel('Weight (lbs.)')

weights = heights_weights['Weight']
plt.plot(heights, weights, '.k', mew = 0, alpha = .1)

lowess_line = lowess(weights, heights)

plt.figure(figsize = (10, 7))
plt.plot(heights, weights, '.', mfc = 'steelblue', mew=0, alpha = .25)
plt.plot(lowess_line[:,0], lowess_line[:, 1],  '-', lw = 2,
         color = '#461B7E', label = "Lowess fit")
plt.legend(loc = "upper left")

heights_weights['Male'] = heights_weights['Gender'].map({'Male': 1, 'Female': 0})
male_logit = logit(formula = 'Male ~ Height + Weight', df = heights_weights).fit()
print male_logit.summary()

male_glm_logit = glm('Male ~ Height + Weight', df = heights_weights,
                     family = sm.families.Binomial(sm.families.links.logit)).fit()
print male_glm_logit.summary()     

logit_pars = male_logit.params
intercept = -logit_pars['Intercept'] / logit_pars['Weight']
slope =  -logit_pars['Height'] / logit_pars['Weight']

fig = plt.figure(figsize = (10, 8))
# Women points (coral)
plt.plot(heights_f, weights_f, '.', label = 'Female', 
         mfc = 'None', mec='coral', alpha = .4)
# Men points (blue)
plt.plot(heights_m, weights_m, '.', label = 'Male', 
         mfc = 'None', mec='steelblue', alpha = .4)
# The separating line
plt.plot(array([50, 80]), intercept + slope * array([50, 80]), 
         '-', color = '#461B7E')
plt.xlabel('Height (in.)')
plt.ylabel('Weight (lbs.)')
plt.legend(loc='upper left')