%pylab inline import numpy as np from pandas import * import matplotlib.pyplot as plt import os import statsmodels.api as sm from statsmodels.nonparametric.kde import KDE from statsmodels.nonparametric.smoothers_lowess import lowess from statsmodels.formula.api import logit, glm heights_weights = read_table('data/01_heights_weights_genders.csv', sep = ',', header = 0) print heights_weights.head() print heights_weights['Gender'].value_counts() heights = heights_weights['Height'] heights.describe() def my_mean(x): return float(np.sum(x)) / len(x) def my_median(x): ''' Compute the median of a series x. ''' # Get a sorted copy of the values in the series (need to call values # otherwise integer indexing messes things up.) sorted_x = np.sort(x.values) if len(x) % 2 == 0: indices = [0.5 * len(x) - 1, 0.5 * len(x)] return np.mean(sorted_x[indices]) else: # Ceil(x) - 1 = Floor(x), but this is to make clear that the -1 is to # account for 0-based counting. index = ceil(0.5 * len(x)) - 1 return sorted_x.ix[index] my_mean(heights) - heights.mean() my_median(heights) - heights.median() print 'Range of heights (in.): ', heights.min(), heights.max() # Range = max - min. Note: np.ptp(heights.values) will do the same thing. # HT Nathaniel Smith def my_range(s): ''' Difference between the max and min of an array or Series ''' return s.max() - s.min() print 'Range width of heights (in.) (my_range()): ', my_range(heights) print 'Range width of heights (in.) (np.ptp()): ',np.ptp(heights) heights.quantile(.5) def my_quantiles(s, prob = (0.0, 0.25, 0.5, 1.0)): ''' Calculate quantiles of a series. Parameters: ----------- s : a pandas Series prob : a tuple (or other iterable) of probabilities at which to compute quantiles. Must be an iterable, even for a single probability (e.g. prob = (0.50) not prob = 0.50). Returns: -------- A pandas series with the probabilities as an index. ''' q = [s.quantile(p) for p in prob] return Series(q, index = prob) # With the default prob argument my_quantiles(heights) my_quantiles(heights, prob = arange(0, 1.1, 0.1)) def my_var(x): return np.sum((x - x.mean())**2) / (len(x) - 1) my_var(heights) - heights.var() def my_sd(x): return np.sqrt(my_var(x)) my_sd(heights) - heights.std() # 1-inch bins bins1 = np.arange(heights.min(), heights.max(), 1.) heights.hist(bins = bins1, fc = 'steelblue') # 5-inch bins bins5 = np.arange(heights.min(), heights.max(), 5.) heights.hist(bins = bins5, fc = 'steelblue') # 0.001-inch bins bins001 = np.arange(heights.min(), heights.max(), .001) heights.hist(bins = bins001, fc = 'steelblue') heights_kde = KDE(heights) heights_kde.fit() fig = plt.figure() plt.plot(heights_kde.support, heights_kde.density) heights_m = heights[heights_weights['Gender'] == 'Male'].values heights_f = heights[heights_weights['Gender'] == 'Female'].values heights_m_kde = KDE(heights_m) heights_f_kde = KDE(heights_f) heights_m_kde.fit() heights_f_kde.fit() fig = plt.figure() plt.plot(heights_m_kde.support, heights_m_kde.density, label = 'Male') plt.plot(heights_f_kde.support, heights_f_kde.density, label = 'Female') plt.legend() weights_m = heights_weights[heights_weights['Gender'] == 'Male']['Weight'].values weights_f = heights_weights[heights_weights['Gender'] == 'Female']['Weight'].values weights_m_kde = KDE(weights_m) weights_f_kde = KDE(weights_f) weights_m_kde.fit() weights_f_kde.fit() fig = plt.figure() plt.plot(weights_m_kde.support, weights_f_kde.density, label = 'Male') plt.plot(weights_f_kde.support, weights_f_kde.density, label = 'Female') plt.legend() fig, axes = plt.subplots(nrows = 2, ncols = 1, sharex = True, figsize = (9, 6)) plt.subplots_adjust(hspace = 0.1) axes[0].plot(weights_f_kde.support, weights_f_kde.density, label = 'Female') axes[0].xaxis.tick_top() axes[0].legend() axes[1].plot(weights_m_kde.support, weights_f_kde.density, label = 'Male') axes[1].legend() plt.xlabel('Weight (lbs.)') weights = heights_weights['Weight'] plt.plot(heights, weights, '.k', mew = 0, alpha = .1) lowess_line = lowess(weights, heights) plt.figure(figsize = (10, 7)) plt.plot(heights, weights, '.', mfc = 'steelblue', mew=0, alpha = .25) plt.plot(lowess_line[:,0], lowess_line[:, 1], '-', lw = 2, color = '#461B7E', label = "Lowess fit") plt.legend(loc = "upper left") heights_weights['Male'] = heights_weights['Gender'].map({'Male': 1, 'Female': 0}) male_logit = logit(formula = 'Male ~ Height + Weight', df = heights_weights).fit() print male_logit.summary() male_glm_logit = glm('Male ~ Height + Weight', df = heights_weights, family = sm.families.Binomial(sm.families.links.logit)).fit() print male_glm_logit.summary() logit_pars = male_logit.params intercept = -logit_pars['Intercept'] / logit_pars['Weight'] slope = -logit_pars['Height'] / logit_pars['Weight'] fig = plt.figure(figsize = (10, 8)) # Women points (coral) plt.plot(heights_f, weights_f, '.', label = 'Female', mfc = 'None', mec='coral', alpha = .4) # Men points (blue) plt.plot(heights_m, weights_m, '.', label = 'Male', mfc = 'None', mec='steelblue', alpha = .4) # The separating line plt.plot(array([50, 80]), intercept + slope * array([50, 80]), '-', color = '#461B7E') plt.xlabel('Height (in.)') plt.ylabel('Weight (lbs.)') plt.legend(loc='upper left')