%matplotlib inline
import re
import urllib
from zipfile import ZipFile
from path import path
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *

urllib.urlretrieve("http://www.ssa.gov/oact/babynames/names.zip", "names.zip")
zf = ZipFile("names.zip")
def read_names(f):
  data = pd.read_csv(zf.open(f), header = None, names = ['name', 'sex', 'n'])
  data['year'] = int(re.findall(r'\d+', f)[0])
  return data
  
bnames = pd.concat([read_names(f) for f in zf.namelist() if f.endswith('.txt')])
bnames.head()

urllib.urlretrieve("http://www.ssa.gov/oact/babynames/state/namesbystate.zip", "namesbystate.zip")
zf = ZipFile("namesbystate.zip")
def read_names2(f):
  return pd.read_csv(zf.open(f), header = None, names = ['state', 'sex', 'year', 'name', 'n'])
  
bnames_by_state = pd.concat([read_names2(f) for f in zf.namelist() if f.endswith('.TXT')])
bnames_by_state.head()

lifetables = pd.read_csv('lifetables.csv')
lifetables.head()

lifetables_2014 = lifetables[lifetables['year'] + lifetables['x'] == 2014]
lifetables_2014.head()

def process(d, kind = 'slinear'):
  f = interp1d(d.year, d.lx, kind)
  year = np.arange(1900, 2011)
  lx = f(year)
  return pd.DataFrame({"year": year, "lx": lx, "sex": d.sex.iloc[1]})
lifetable_2014 = lifetables_2014.\
  groupby('sex', as_index = False).\
  apply(process)
lifetable_2014.head()

urllib.urlretrieve("http://www.census.gov/statab/hist/02HS0013.xls", "02HS0013.xls")
dat = pd.read_excel('02HS0013.xls', sheetname = 'HS-13', skiprows = range(14))
tot_births = dat.ix[9:101,:2].reset_index(drop = True)
tot_births.columns = ['year', 'births']
tot_births = tot_births.convert_objects(convert_numeric = True)
tot_births.head()

cor_factors = bnames.\
  groupby('year', as_index = False).\
  sum().\
  merge(tot_births)
cor_factors['cor'] = cor_factors['births']*1000/cor_factors['n']
cor_factors = cor_factors[['year', 'cor']]
cor_factors.head()

cor_new = pd.DataFrame({
  'year': range(2002, 2014),
  'cor': cor_factors.cor.iloc[-1]
})
cor_factors = pd.concat([cor_factors, cor_new])[['year', 'cor']]
cor_factors.head()

def get_data(name, sex, state = None):
  if state is None:
    dat = bnames
  else:
    dat = bnames_by_state[(bnames_by_state["state"] == state)]
  data = dat[(dat['name'] == name) & (dat['sex'] == sex)].\
    merge(cor_factors).\
    merge(lifetable_2014)
  data['n_cor'] = data['n']*data['cor']
  data['n_alive'] = data['lx']/(10**5)*data['n_cor']
  return data
get_data('Violet', 'F').head()

def plot_name(name, sex, state = None):
  data = get_data(name, sex, state)
  return ggplot(data, aes('year', 'n_cor')) +\
    geom_line() +\
    geom_area(aes(ymin = 0, ymax = 'n_alive'), alpha = 0.5)
plot_name("Joseph", "F")

plot_name("Violet", "F", "MA")

# from the module wquantiles https://github.com/nudomarinero/wquantiles/blob/master/weighted.py
import numpy as np
def quantile_1D(data, weights, quantile):
    """
    Compute the weighted quantile of a 1D numpy array.

    Parameters
    ----------
    data : ndarray
        Input array (one dimension).
    weights : ndarray
        Array with the weights of the same size of `data`.
    quantile : float
        Quantile to compute. It must have a value between 0 and 1.

    Returns
    -------
    quantile_1D : float
        The output value.
    """
    # Check the data
    if not isinstance(data, np.matrix) :
        data = np.asarray(data)
    if not isinstance(weights, np.matrix) :
        weights = np.asarray(weights)
    nd = data.ndim
    if nd != 1:
        raise TypeError("data must be a one dimensional array")
    ndw = weights.ndim
    if ndw != 1:
        raise TypeError("weights must be a one dimensional array")
    if data.shape != weights.shape:
        raise TypeError("the length of data and weights must be the same")
    if ((quantile > 1.) or (quantile < 0.)):
        raise ValueError("quantile must have a value between 0. and 1.")
    # Sort the data
    ind_sorted = np.argsort(data)
    sorted_data = data[ind_sorted]
    sorted_weights = weights[ind_sorted]
    # Compute the auxiliary arrays
    Sn = np.cumsum(sorted_weights)
    # TODO: Check that the weights do not sum zero
    Pn = (Sn-0.5*sorted_weights)/np.sum(sorted_weights)
    # Get the value of the weighted median
    return np.interp(quantile, Pn, sorted_data)


def quantile(data, weights, quantile):
    """
    Weighted quantile of an array with respect to the last axis.

    Parameters
    ----------
    data : ndarray
        Input array.
    weights : ndarray
        Array with the weights. It must have the same size of the last 
        axis of `data`.
    quantile : float
        Quantile to compute. It must have a value between 0 and 1.

    Returns
    -------
    quantile : float
        The output value.
    """
    # TODO: Allow to specify the axis
    nd = data.ndim
    if nd == 0:
        TypeError("data must have at least one dimension")
    elif nd == 1:
        return quantile_1D(data, weights, quantile)
    elif nd > 1:
        n = data.shape
        imr = data.reshape((np.prod(n[:-1]), n[-1]))
        result = np.apply_along_axis(quantile_1D, -1, imr, weights, quantile)
        return result.reshape(n[:-1])

def estimate_age(name, sex, state = None):
  data = get_data(name, sex, state)
  qs = [1, 0.75, 0.5, 0.25, 0]
  quantiles = [2014 - int(quantile(data.year, data.n_alive, q)) for q in qs]
  result = dict(zip(['q0', 'q25', 'q50', 'q75', 'q100'], quantiles))
  result['p_alive'] = round(data.n_alive.sum()/data.n_cor.sum()*100, 2)
  result['sex'] = sex
  result['name'] = name
  return pd.Series(result)
  
estimate_age('Gertrude', 'F')

estimate_age('Ava', 'F')

top_100_names = bnames.\
  groupby(['name', 'sex'], as_index = False).\
  sum().\
  sort('n', ascending = False)
top_25_females = top_100_names[(top_100_names["sex"] == "F")]
estimates = pd.concat([estimate_age(name, 'F') for name in top_25_females["name"].iloc[:25].tolist()], axis = 1)
estimates.T.sort('q50').reset_index()