%matplotlib inline import re import urllib from zipfile import ZipFile from path import path import numpy as np import pandas as pd from scipy.interpolate import interp1d import matplotlib.pyplot as plt import seaborn as sns from ggplot import * urllib.urlretrieve("http://www.ssa.gov/oact/babynames/names.zip", "names.zip") zf = ZipFile("names.zip") def read_names(f): data = pd.read_csv(zf.open(f), header = None, names = ['name', 'sex', 'n']) data['year'] = int(re.findall(r'\d+', f)[0]) return data bnames = pd.concat([read_names(f) for f in zf.namelist() if f.endswith('.txt')]) bnames.head() urllib.urlretrieve("http://www.ssa.gov/oact/babynames/state/namesbystate.zip", "namesbystate.zip") zf = ZipFile("namesbystate.zip") def read_names2(f): return pd.read_csv(zf.open(f), header = None, names = ['state', 'sex', 'year', 'name', 'n']) bnames_by_state = pd.concat([read_names2(f) for f in zf.namelist() if f.endswith('.TXT')]) bnames_by_state.head() lifetables = pd.read_csv('lifetables.csv') lifetables.head() lifetables_2014 = lifetables[lifetables['year'] + lifetables['x'] == 2014] lifetables_2014.head() def process(d, kind = 'slinear'): f = interp1d(d.year, d.lx, kind) year = np.arange(1900, 2011) lx = f(year) return pd.DataFrame({"year": year, "lx": lx, "sex": d.sex.iloc[1]}) lifetable_2014 = lifetables_2014.\ groupby('sex', as_index = False).\ apply(process) lifetable_2014.head() urllib.urlretrieve("http://www.census.gov/statab/hist/02HS0013.xls", "02HS0013.xls") dat = pd.read_excel('02HS0013.xls', sheetname = 'HS-13', skiprows = range(14)) tot_births = dat.ix[9:101,:2].reset_index(drop = True) tot_births.columns = ['year', 'births'] tot_births = tot_births.convert_objects(convert_numeric = True) tot_births.head() cor_factors = bnames.\ groupby('year', as_index = False).\ sum().\ merge(tot_births) cor_factors['cor'] = cor_factors['births']*1000/cor_factors['n'] cor_factors = cor_factors[['year', 'cor']] cor_factors.head() cor_new = pd.DataFrame({ 'year': range(2002, 2014), 'cor': cor_factors.cor.iloc[-1] }) cor_factors = pd.concat([cor_factors, cor_new])[['year', 'cor']] cor_factors.head() def get_data(name, sex, state = None): if state is None: dat = bnames else: dat = bnames_by_state[(bnames_by_state["state"] == state)] data = dat[(dat['name'] == name) & (dat['sex'] == sex)].\ merge(cor_factors).\ merge(lifetable_2014) data['n_cor'] = data['n']*data['cor'] data['n_alive'] = data['lx']/(10**5)*data['n_cor'] return data get_data('Violet', 'F').head() def plot_name(name, sex, state = None): data = get_data(name, sex, state) return ggplot(data, aes('year', 'n_cor')) +\ geom_line() +\ geom_area(aes(ymin = 0, ymax = 'n_alive'), alpha = 0.5) plot_name("Joseph", "F") plot_name("Violet", "F", "MA") # from the module wquantiles https://github.com/nudomarinero/wquantiles/blob/master/weighted.py import numpy as np def quantile_1D(data, weights, quantile): """ Compute the weighted quantile of a 1D numpy array. Parameters ---------- data : ndarray Input array (one dimension). weights : ndarray Array with the weights of the same size of `data`. quantile : float Quantile to compute. It must have a value between 0 and 1. Returns ------- quantile_1D : float The output value. """ # Check the data if not isinstance(data, np.matrix) : data = np.asarray(data) if not isinstance(weights, np.matrix) : weights = np.asarray(weights) nd = data.ndim if nd != 1: raise TypeError("data must be a one dimensional array") ndw = weights.ndim if ndw != 1: raise TypeError("weights must be a one dimensional array") if data.shape != weights.shape: raise TypeError("the length of data and weights must be the same") if ((quantile > 1.) or (quantile < 0.)): raise ValueError("quantile must have a value between 0. and 1.") # Sort the data ind_sorted = np.argsort(data) sorted_data = data[ind_sorted] sorted_weights = weights[ind_sorted] # Compute the auxiliary arrays Sn = np.cumsum(sorted_weights) # TODO: Check that the weights do not sum zero Pn = (Sn-0.5*sorted_weights)/np.sum(sorted_weights) # Get the value of the weighted median return np.interp(quantile, Pn, sorted_data) def quantile(data, weights, quantile): """ Weighted quantile of an array with respect to the last axis. Parameters ---------- data : ndarray Input array. weights : ndarray Array with the weights. It must have the same size of the last axis of `data`. quantile : float Quantile to compute. It must have a value between 0 and 1. Returns ------- quantile : float The output value. """ # TODO: Allow to specify the axis nd = data.ndim if nd == 0: TypeError("data must have at least one dimension") elif nd == 1: return quantile_1D(data, weights, quantile) elif nd > 1: n = data.shape imr = data.reshape((np.prod(n[:-1]), n[-1])) result = np.apply_along_axis(quantile_1D, -1, imr, weights, quantile) return result.reshape(n[:-1]) def estimate_age(name, sex, state = None): data = get_data(name, sex, state) qs = [1, 0.75, 0.5, 0.25, 0] quantiles = [2014 - int(quantile(data.year, data.n_alive, q)) for q in qs] result = dict(zip(['q0', 'q25', 'q50', 'q75', 'q100'], quantiles)) result['p_alive'] = round(data.n_alive.sum()/data.n_cor.sum()*100, 2) result['sex'] = sex result['name'] = name return pd.Series(result) estimate_age('Gertrude', 'F') estimate_age('Ava', 'F') top_100_names = bnames.\ groupby(['name', 'sex'], as_index = False).\ sum().\ sort('n', ascending = False) top_25_females = top_100_names[(top_100_names["sex"] == "F")] estimates = pd.concat([estimate_age(name, 'F') for name in top_25_females["name"].iloc[:25].tolist()], axis = 1) estimates.T.sort('q50').reset_index()