%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

from pylab import figure, show

from pandas import DataFrame, Series
import pandas as pd

try:
    import mpld3
    from mpld3 import enable_notebook
    from mpld3 import plugins
    enable_notebook()
except Exception as e:
    print "Attempt to import and enable mpld3 failed", e

# what would seaborn do?
try:
    import seaborn as sns
except Exception as e:
    print "Attempt to import and enable seaborn failed", e

import os

# The baby-name files are read from the "names" directory of a pydata-book
# checkout located one level above the current directory
# (../pydata-book/ch02/names).
NAMES_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "names")

# Stop right away if the data directory is missing instead of failing later
# while reading the individual files.
# NOTE(review): assert statements are skipped when Python runs with -O.
assert os.path.exists(NAMES_DIR)

# show the first five entries found in NAMES_DIR

import glob
glob.glob(NAMES_DIR + "/*")[:5]

# Load one file per year into a single DataFrame called `names`.
# 2010 is the last available year in the pydata-book repo
import os

# range() excludes its stop value, so this covers 1880 through 2010.
years = range(1880, 2011)

pieces = []
# Column names are supplied here and passed to read_csv via names=.
columns = ['name', 'sex', 'births']

for year in years:
    # One file per year, named yob<year>.txt.
    path = os.path.join(NAMES_DIR, 'yob%d.txt' % year)
    frame = pd.read_csv(path, names=columns)

    # The year is only encoded in the file name, so add it as a column.
    frame['year'] = year
    pieces.append(frame)

# Concatenate everything into a single DataFrame; ignore_index=True gives the
# result a fresh index instead of repeating each file's own row numbers.
names = pd.concat(pieces, ignore_index=True)

# Summary statistics of the numeric columns.
# The original note asked "why floats?": presumably because describe()
# reports statistics such as mean and std, which are not integers -- confirm.
names.describe()

# how many people, names, males and females are represented in names?

# total births over all years, names and sexes
names.births.sum()

# F vs M: total births per sex

names.groupby('sex')['births'].sum()

# total number of distinct names (one group per name)

len(names.groupby('name'))

# use pivot_table to collect records by year (rows) and sex (columns)
# NOTE(review): rows=/cols= are the keyword names of older pandas releases;
# newer releases call them index=/columns= -- confirm the pandas version in use.

total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)
total_births.head()

# You can use groupby to get the equivalent of the pivot_table calculation:
# sum within each (year, sex), then unstack sex into columns

names.groupby('year').apply(lambda s: s.groupby('sex').agg('sum')).unstack()['births']

# how to calculate the total births / year

names.groupby('year').sum().plot(title="total births by year")

# same groupby/unstack expression as above, plotted as one line per sex
names.groupby('year').apply(lambda s: s.groupby('sex').agg('sum')).unstack()['births'].plot(title="births (M/F) by year")

# from book: add prop to names

def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's share of the group's total births
    (``births / births.sum()``).

    Parameters
    ----------
    group : DataFrame
        The rows of one group; must have a numeric ``births`` column.

    Returns
    -------
    DataFrame
        A copy of ``group`` with the additional float column ``prop``.
        The frame passed in is left untouched.
    """
    # Work on a copy: a function handed to groupby().apply() should not
    # mutate the group it receives.
    group = group.copy()

    # Cast to float first so the division below is true division even
    # where int / int floors (Python 2).
    births = group.births.astype(float)

    group['prop'] = births / births.sum()
    return group

# Rebind names to the result of running add_prop on every (year, sex) group.
names = names.groupby(['year', 'sex']).apply(add_prop)


# verify prop --> within every (year, sex) group the values add up to 1

np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)

# number of records in full names dataframe

len(names)

# from book: useful to work with top 1000 for each year/sex combo
# can use groupby/apply; this version uses a lambda and its result is only
# displayed, not stored

names.groupby(['year', 'sex']).apply(lambda g: g.sort_index(by='births', ascending=False)[:1000])

def get_top1000(group):
    """Return the rows of *group* with the most births, largest first.

    Parameters
    ----------
    group : DataFrame
        Rows of one group; must have a ``births`` column.

    Returns
    -------
    DataFrame
        At most the first 1000 rows of ``group`` after sorting by
        ``births`` in descending order.
    """
    # DataFrame.sort_index(by=...) was deprecated in pandas 0.17 in favour of
    # sort_values() and later removed, so use whichever this pandas offers.
    if hasattr(group, 'sort_values'):
        ordered = group.sort_values(by='births', ascending=False)
    else:
        ordered = group.sort_index(by='births', ascending=False)
    return ordered[:1000]

# Keep only the rows get_top1000 returns for each (year, sex) group.
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()

# Do pivot table: rows = year, columns = name, values = summed births,
# restricted to the top-1000 rows

top_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=np.sum)
top_births.tail()

# is your name in the top_births list?
# (indexing with a name that is not a column raises a KeyError)

top_births['Raymond'].plot(title='plot for Raymond')

# for Aaden, which shows up at the end; xlim pins the x axis to 1880-2010

top_births.Aaden.plot(xlim=[1880,2010])

# number of names represented in top_births

len(top_births.columns)

# how to get the most popular name of all time in top_births?
# Sum every name's column over all years, then order by that total.

most_common_names = top_births.sum()
# The return value is not used, so this relies on Series.sort reordering
# most_common_names in place.
# NOTE(review): that is the behaviour of older pandas releases -- confirm.
most_common_names.sort(ascending=False)

most_common_names.head()

# as of mpld3 v0.1 (2014.03.04), the name labeling doesn't work -- so disable mpld3 for this figure
# NOTE(review): mpld3 is only bound if the guarded import near the top of the
# file succeeded; otherwise these calls fail with a NameError.

mpld3.disable_notebook()
plt.figure()
# First 50 entries of most_common_names, reversed before plotting as
# horizontal bars.
most_common_names[:50][::-1].plot(kind='barh', figsize=(10,10))

# turn mpld3 back on

mpld3.enable_notebook()

# instead of top_births -- build all_births from every record in names:
# rows = year, columns = name, values = summed births

all_births = names.pivot_table('births', rows='year', cols='name', aggfunc=sum)

# year/name combinations with no records come out of the pivot as NaN;
# treat them as zero births
all_births = all_births.fillna(0)
all_births.tail()

# set up to do start/end calculation: running total of births per name,
# accumulated down the year axis.  DataFrame.cumsum does this for all columns
# at once, which gives the same table as applying Series.cumsum to every
# column separately but avoids one Python-level call per name.

all_births_cumsum = all_births.cumsum(axis=0)

all_births_cumsum.tail()

# remind ourselves of what's in names

names.head()

# columns in names

names.columns

# calculate set of male_only, female_only, ambigender names

def calc_of_sex_of_names():
    """Split the distinct names in the global ``names`` frame by sex.

    Returns a dict of three sets:
      'male_only_names'   -- names recorded for 'M' but never for 'F'
      'female_only_names' -- names recorded for 'F' but never for 'M'
      'ambi_names'        -- names recorded for both sexes
    """
    # One set of distinct names per value of the 'sex' column.
    names_for = names.groupby('sex').apply(lambda rows: set(rows['name']))
    boys, girls = names_for['M'], names_for['F']

    result = {}
    result['male_only_names'] = boys - girls
    result['female_only_names'] = girls - boys
    result['ambi_names'] = girls & boys  # set intersection
    return result
    
# Classify names by sex and keep the names used for both sexes as an array
# so it can be passed to np.in1d below.
names_by_sex = calc_of_sex_of_names() 
ambi_names_array = np.array(list(names_by_sex['ambi_names']))

# size of each of the three name sets
[(k, len(v)) for (k,v) in names_by_sex.items()]

# total number of people in names
names.births.sum()

# pivot table of ambigendered names to aggregate:
# rows = year, columns = (name, sex) pairs, values = summed births

# keep only the records whose name is in ambi_names_array
names_ambi = names[np.in1d(names.name,ambi_names_array)]
ambi_names_pt = names_ambi.pivot_table('births',
                            rows='year', 
                            cols=['name','sex'], 
                            aggfunc='sum')

# total number of people counted in ambi_names_pt -- almost everyone!
# (compare with the total for names displayed above)

ambi_names_pt.sum().sum()

# fill n/a with 0 and look at the table at the end
# (year/name/sex combinations with no records are NaN after the pivot).
# A plain 0 replaces the Python 2-only long literal 0L: it fills the same
# value and the line also parses on Python 3.

ambi_names_pt=ambi_names_pt.fillna(0)
ambi_names_pt.tail()

# cumulative births over time within the ambigender names, first for M and
# then for F: select one sex, sum over names per year, then take the running
# total (the results are displayed, not plotted)
ambi_names_pt.T.xs('M',level='sex').sum().cumsum()

ambi_names_pt.T.xs('F',level='sex').sum().cumsum()

# don't know why the pivot table has type float
# https://github.com/pydata/pandas/issues/3283
ambi_names_pt['Raymond', 'M'].dtype

# calculate proportion of males for given name

def prop_male(name):
    """Return, per row of ``ambi_names_pt``, the male share of births for *name*.

    Computes M / (M + F) from the 'M' and 'F' columns stored under ``name``
    in the global ``ambi_names_pt`` pivot table.
    """
    by_sex = ambi_names_pt[name]
    males = by_sex['M']
    females = by_sex['F']
    return males / (males + females)

def prop_c_male(name):
    """Return the cumulative male share of births for *name*.

    Like ``prop_male`` but based on running totals: at each row of the global
    ``ambi_names_pt`` table the result is cumsum(M) / (cumsum(M) + cumsum(F)).
    """
    by_sex = ambi_names_pt[name]
    males_so_far = by_sex['M'].cumsum()
    females_so_far = by_sex['F'].cumsum()
    return males_so_far / (males_so_far + females_so_far)

# cumulative male proportion over time for one example name
prop_c_male('Leslie').plot()

# Build a table with one column per name holding the male proportion of that
# year's births.
# I couldn't figure out a way of iterating over the names rather than names/sex combo in
# a vectorized way.

from itertools import islice

# Level 0 of the (name, sex) column index holds the names.  islice(..., None)
# keeps all of them; a number in place of None would limit the calculation to
# the first few names.
names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None))

# (name, M / (F + M)) pairs, one per name
m = [(name_, ambi_names_pt[name_]['M']/(ambi_names_pt[name_]['F'] + ambi_names_pt[name_]['M']))  \
     for name_ in names_to_calc]
p_m_instant = DataFrame(dict(m))
p_m_instant.tail()

# similar calculation except instead of looking at the proportions for a given year only,
# we look at the cumulative number of male/female babies for given name

from itertools import islice

# recomputed here exactly as above
names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]),None))

# (name, cumsum(M) / (cumsum(F) + cumsum(M))) pairs, one per name
m = [(name_, ambi_names_pt[name_]['M'].cumsum()/(ambi_names_pt[name_]['F'].cumsum() + ambi_names_pt[name_]['M'].cumsum()))  \
     for name_ in names_to_calc]
p_m_cum = DataFrame(dict(m))
p_m_cum.tail()

# cumulative male proportion for one example name
p_m_cum['Donnie'].plot()

# some metrics that attempt to measure how a time series s has changed

def min_max_range(s):
    """Return the signed range ``max(s) - min(s)`` of a series of values.

    The sign tells which extreme comes first: positive when the first
    occurrence of the maximum lies after the first occurrence of the minimum
    (a net rise), negative when it lies before (a net fall), and 0.0 when all
    values are equal.

    NaN entries are ignored.  Positions, not index labels, decide the order,
    so the result does not depend on how a Series happens to be labelled.

    Parameters
    ----------
    s : sequence of numbers (list, ndarray or pandas Series)

    Returns
    -------
    float
        The signed range, or NaN when ``s`` has no non-NaN values.
    """
    values = np.asarray(s, dtype=float)
    # Dropping NaN keeps the remaining values in their original order, so
    # the position comparison below is unaffected.
    values = values[~np.isnan(values)]
    if values.size == 0:
        return float('nan')

    # np.argmax / np.argmin return the position of the first occurrence of
    # the global max / min.
    direction = np.sign(int(np.argmax(values)) - int(np.argmin(values)))
    if direction == 0:
        return 0.0
    return float(direction * (values.max() - values.min()))

def last_first_diff(s):
    """Return the latest non-NaN value of *s* minus its earliest non-NaN value.

    Parameters
    ----------
    s : pandas Series

    Returns
    -------
    The difference ``last - first`` over the non-NaN entries, or NaN when
    ``s`` has no non-NaN entries (instead of raising IndexError).
    """
    observed = s.dropna()
    if len(observed) == 0:
        return np.nan
    return observed.iloc[-1] - observed.iloc[0]
    

# population distributions of ambinames
# might want to remove from consideration instances when total ratio is too great
# or range of existence of a name/sex combo too short

# total births per name over all years, restricted to the ambigender names
total_pop_ambiname = all_births.sum()[np.in1d(all_births.sum().index, ambi_names_array)]
# The return value is not used, so this relies on Series.sort reordering the
# series in place.
# NOTE(review): that is the behaviour of older pandas releases -- confirm.
total_pop_ambiname.sort(ascending=False)
total_pop_ambiname.plot(logy=True)

# now calculate a DataFrame to visualize results

# calculate the total population, the change in p_m from first to last appearance,
# the signed max-min range of p_m, and the cumulative proportion of males for each name

df = DataFrame()
df['total_pop'] = total_pop_ambiname
# each metric is computed column by column, i.e. once per name in p_m_cum
df['last_first_diff'] = p_m_cum.apply(last_first_diff)
df['min_max_range'] = p_m_cum.apply(min_max_range)
df['abs_min_max_range'] = np.abs(df.min_max_range)
# last row of p_m_cum: the cumulative male proportion at the end of the table
df['p_m'] = p_m_cum.iloc[-1]

# distance from full ambigender -- p_m=0.5 leads to 1, p_m=1 or 0 -> 0
df['ambi_index'] = df.p_m.apply(lambda p: 1 - 2* np.abs(p-0.5))

df.head()

# plot: x -> log10 of total population, y -> how p_m has changed from first to last
# turn off d3 for this plot
# NOTE(review): mpld3 is only bound if the guarded import near the top of the
# file succeeded.

mpld3.disable_notebook()
plt.scatter(np.log10(df.total_pop), df.last_first_diff, s=1)

# turn d3 back on

mpld3.enable_notebook()

# general directionality counts -- looking for an overall asymmetry between
# names whose p_m fell (-1), stayed the same (0) or rose (+1)

df.groupby(np.sign(df.last_first_diff)).count()

# let's concentrate on more populous names that have seen big swings in the cumulative p_m

# you can play with the population and range filter
popular_names_with_shifts = df[(df.total_pop>5000) & (df.abs_min_max_range >0.7)]
# displayed only: the sorted frame is not assigned back
popular_names_with_shifts.sort_index(by="abs_min_max_range", ascending=False)


# directionality counts again, for the filtered names only
# NOTE(review): the grouping key comes from df rather than from
# popular_names_with_shifts; presumably pandas aligns it on the shared index
# -- confirm.
popular_names_with_shifts.groupby(np.sign(df.last_first_diff)).count()

#popular_names_with_shifts.to_pickle('popular_names_with_shifts.pickle')

# Scatter plot of the filtered names with an mpld3 tooltip showing each
# point's name.
# NOTE(review): axisbg is the axes-background keyword of older matplotlib
# releases (later called facecolor) -- confirm the matplotlib version in use.
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
x = np.log10(popular_names_with_shifts.total_pop)
y = popular_names_with_shifts.min_max_range 

scatter = ax.scatter(x, y)

ax.grid(color='white', linestyle='solid')
ax.set_title("Populous Names with Major Sex Shift", size=20)
ax.set_xlabel('log10(total_pop)')
ax.set_ylabel('min_max_range')

# One label per point, taken from the frame's index (the names), in the same
# order as x and y.
#labels = ['point {0}'.format(i + 1) for i in range(len(x))]
labels = list(popular_names_with_shifts.index)
# NOTE(review): plugins is only bound if the guarded mpld3 import near the
# top of the file succeeded.
tooltip = plugins.PointLabelTooltip(scatter, labels=labels)
plugins.connect(fig, tooltip)

# cumulative male proportion over time for one example name
prop_c_male('Leslie').plot()