#Python/Pandas script to analyze baby names database 1880-2012
#from the U.S. Social Security Administration
#
#By David Taylor Feb. 2014
#
#www.prooffreader.com (for blogged results)
#
#prooffreaderplus.blogspot.com (for scripts, calculations, links, gits, etc.)
#
#Disclaimer: not a professional programmer, more interested right now in results in a reasonable time.
#  That said, constructive critique and suggestions are always totally welcome. I'm not proud.
#  In particular, there is a lot of very needless duplication of boy and girl databases and code
#    in loops that refer to them, when I could just subset a larger database every time,
#    but I have enough memory in my computer and not enough in my head so I just found it easier
#    to work in this inefficient fashion. Your mileage may vary.
#  Note that I usually use column names instead of indexes, my brain just deals with them better
#    right now, as I get more used to pandas I'm already starting to adapt.
#  Also, sometimes I just print-dump everything to a csv file and work with it in Excel. Sorry!

#Instructions:
#1. Download data set at (as of Feb. 2014) http://www.ssa.gov/OACT/babynames/names.zip
#2. Unzip into a working directory
#3. Change the working directory strings in this script.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re as re
import scipy
import os

os.chdir("C:/_Dropbox/Dropbox/py/babynames/yobs") #change this to your working directory
    
#read yob files, arrange into data frames and concatenate into one data frame
#this portion taken from O'Reilly's Python for Data Analysis (2009)
years = range(1880, 2013) #remember in python a 2013 upper bound means the last value used will be 2012
pieces = []
yobcolumns = ['name', 'sex', 'births']
for year in years:
    path = 'yob%d.txt' % year
    frame = pd.read_csv(path, names=yobcolumns)
    frame['year'] = year
    pieces.append(frame)
df = pd.concat(pieces, ignore_index=True)

os.chdir("C:/_Dropbox/Dropbox/py/babynames/") #change this to your working directory

#add column 'pct' that is the number of births of that name and sex in that year
#divided by the total number of births of that sex in that year, multiplied by
#100 to turn into a percentage and reduce all those leading zeroes
def add_pct(group):
    births = group.births.astype(float)
    group['pct'] = (births / births.sum() * 100)
    return group
df = df.groupby(['year', 'sex']).apply(add_pct)

#add rank of each name each year each sex
df['ranked'] = df.groupby(['year', 'sex'])['births'].rank(ascending=False)

#subset girls and boys
dff = df[df.sex == 'F']
dfm = df[df.sex == 'M']

#create names dataframe. This DF discards individual birth or pct values, and instead collects data on unique names.
#There is one row per unique combination of name and sex.
temp_count = pd.DataFrame(data=dff['name'].value_counts(), columns=['year_count'])
temp_min = pd.DataFrame(data=dff.groupby('name').year.min(), columns = ['year_min'])
temp_max = pd.DataFrame(data=dff.groupby('name').year.max(), columns = ['year_max'])
temp_pctsum = pd.DataFrame(data=dff.groupby('name').pct.sum(), columns = ['pct_sum'])
temp_pctmax = pd.DataFrame(data=dff.groupby('name').pct.max(), columns = ['pct_max'])
temp_f = temp_count.join(temp_min)
temp_f = temp_f.join(temp_max)
temp_f = temp_f.join(temp_pctsum)
temp_f = temp_f.join(temp_pctmax)
temp_f['sex'] = "F"
temp_f.reset_index(inplace=True, drop=False)
temp_f.columns = ['name', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max', 'sex']
temp_f = temp_f[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]
temp_count = pd.DataFrame(data=dfm['name'].value_counts(), columns=['year_count'])
temp_min = pd.DataFrame(data=dfm.groupby('name').year.min(), columns = ['year_min'])
temp_max = pd.DataFrame(data=dfm.groupby('name').year.max(), columns = ['year_max'])
temp_pctsum = pd.DataFrame(data=dfm.groupby('name').pct.sum(), columns = ['pct_sum'])
temp_pctmax = pd.DataFrame(data=dfm.groupby('name').pct.max(), columns = ['pct_max'])
temp_m = temp_count.join(temp_min)
temp_m = temp_m.join(temp_max)
temp_m = temp_m.join(temp_pctsum)
temp_m = temp_m.join(temp_pctmax)
temp_m['sex'] = "M"
temp_m.reset_index(inplace=True, drop=False)
temp_m.columns = ['name', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max', 'sex']
temp_m = temp_m[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]
names = pd.concat([temp_f, temp_m], ignore_index=True)

# create years dataframe. This DF discards individual name data, aggregating by year.
total = pd.DataFrame(df.pivot_table('births', rows='year', cols = 'sex', aggfunc=sum))
total.reset_index(drop=False, inplace=True)
total.columns = ['year', 'births_f', 'births_m']
total['births_t'] = total.births_f + total.births_m
newnames = pd.DataFrame(data=names.groupby('year_min').year_min.count(), columns = ['firstyearcount'])
newnames.reset_index(drop=False, inplace=True)
newnames.columns = ['year', 'new_names']
uniquenames = pd.DataFrame(columns=['year', 'unique_names'])
for yr in range(1880, 2013):
    uniquenames = uniquenames.append(pd.DataFrame([{'year':yr, 'unique_names':len(unique(df[df.year == yr].name))}]), ignore_index=True)
years = pd.merge(left=total, right=newnames, on='year', right_index=False, left_index=False)
years = pd.merge(left=years, right=uniquenames, on='year', right_index=False, left_index=False)

#births dataframes, just the number of births per year
births = df.pivot_table('births', rows='year', cols='sex', aggfunc=sum)
births_f = dff.pivot_table('births', rows='year', cols='sex', aggfunc=sum)
births_m = dfm.pivot_table('births', rows='year', cols='sex', aggfunc=sum)

# It takes my $400 Acer desktop computer about 15 seconds for this cell. If you want to compare, put the following, uncommented,
# as the first line:
# %%timeit

# Some custom functions to help exploration:

def headtail(df, num = 5):
    """ Returns concatenated head and tail of dataframe
    arguments: dataframe, integer number of rows in each of head and tail """
    return pd.concat([df.head(num), df.tail(num)], ignore_index=False)

def subset(df, returncol='', searchval = '', searchcol = ''):
    """ Returns dataframe in first argument
    with column named in second argument (or entire row if omitted)
    and value named in third argument (or entire column(s) if omitted
    found in column named in fourth argument (or second argument if omitted)"""
    if returncol == '':
        if searchval <> '' and searchcol <> '':
            return df[df[searchcol] == searchval]
        else:
            return 'Error in argument formulation' 
    else:
        if searchval == '':
            if searchcol == '':
                return df[returncol]
            else:
                return 'Error in argument formulation'
        else:
            if searchcol == '':
                return df[df[returncol] == searchval][returncol]
            else:
                return df[df[searchcol] == searchval][returncol]

#Main dataframe of SSA data:
headtail(df)

print df.describe()

#for comparison with Dept of Health and Human Services database:
dhscompare = []
for yr in range(1910,2010):
    dhscompare.append(yr)
    dhscompare.append(subset(df, 'births', yr, 'year').sum())
dhscompare
#normally this would be better as a dict, but for brevity's sake I output as a list because the DHS data is in Excel
#and I already have an excel macro to parse a list like this.

#verify that dff and dfm are just df split into sex == F and M, respectively.

if len(df) == len(dff) + len(dfm):
    print "Dataframes add up correctly: " + str(len(dff)) + " girls + " + str(len(dfm)) + " boys = " + str(len(df)) + " total."
else:
    print "Dataframes do not add up: " + str(len(dff)) + " girls + " + str(len(dfm)) + " boys != " + str(len(df)) + " total; difference - " + str(len(df) - len(dfm) - len(dff))


#names dataframe
#there are 101260 unique name-sex combinations; e.g., "Jeon" for a boy occurred once only, in 1999.
#births and pct and rank have been discarded, but can easily be looked up from df or dfm.
headtail(names)

print names.describe()

#years dataframe
headtail(years)

print years.describe()

#Make dataframe and graph of first-ranked
rank1m = dfm[dfm.ranked == 1]
rank1f = dff[dff.ranked == 1]
zeroline = scipy.zeros(len(range(1880, 2013)))
figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='k')
plt.plot(rank1m.year, rank1m.pct, color="blue", linewidth = 2, label = 'Boys')
plt.fill_between(rank1m.year, rank1m.pct, color="blue", alpha = 0.1, interpolate=True)
plt.xlim(1880,2012)
plt.ylim(0,9)
plt.xticks(scipy.arange(1880,2012,10))
plt.title("Popularity of #1 boys' name by year", size=18, color="blue")
plt.xlabel('Year', size=15)
plt.ylabel('% of male births', size=15)
plt.show()
plt.close()
figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='k')
plt.plot(rank1f.year, rank1f.pct, color="red", linewidth = 2, label = 'Girls')
plt.fill_between(rank1f.year, rank1f.pct, color="red", alpha = 0.1, interpolate=True)
plt.xlim(1880,2012)
plt.ylim(0,9)
plt.xticks(scipy.arange(1880,2012,10))
plt.title("Popularity of #1 girls' name by year", size=18, color="red")
plt.xlabel('Year', size=15)
plt.ylabel('% of female births', size=15)
plt.show()
plt.close()


# make graphs of gender miscategorizations for all names that were #1 ranked in any year

namelist = list(rank1m.name.unique()) + list(rank1f.name.unique())

for nm in namelist:
    f_or_m = df[df.name == nm]
    f_or_m = pd.pivot_table(f_or_m, 'pct', rows=['year'], cols='sex')
    f_or_m = pd.DataFrame(f_or_m)
    f_or_m = f_or_m.dropna()
    f_or_m['temp'] = f_or_m.F / f_or_m.M
    f_or_m['sex_max'] = f_or_m['temp'].apply(lambda x: 'F' if x >= 1 else 'M')
    f_or_m['temp2'] = f_or_m['sex_max'].apply(lambda x: -1 if x =='F' else 1)
    f_or_m['pctratio'] = 100 * (f_or_m.temp ** f_or_m.temp2)
    y2 = f_or_m.pctratio
    y1 = f_or_m[f_or_m.iloc[0]['sex_max']]
    x = f_or_m.index
    
    if f_or_m.iloc[0]['sex_max'] == 'M':
        sexvar = 'female'
    else:
        sexvar = 'male'

    figure(num=None, figsize=(6,4), dpi=150, facecolor='w', edgecolor='k')
    
    plt.subplot(211)
    plt.xlim(1880,2012)
    plt.xticks([])
    plt.ylabel('% of all births', size=10)
    
    if y1[[1980, 1985, 1990, 1995, 2000, 2005, 2010]].max() / y1.max() < 0.5: #puts name on the left if name was popular in year 2000
        plt.annotate(nm, xy=(.96, .92),  xycoords='axes fraction', size = 17, horizontalalignment='right', verticalalignment='top')
    else:
        plt.annotate(nm, xy=(.05, .92),  xycoords='axes fraction', size = 17, horizontalalignment='left', verticalalignment='top')

    plt.plot(x, y1, color='black', linewidth = 2, label = '2')
    
    plt.subplot(212)
    plt.xlim(1880,2012)
    plt.ylabel('% ' + nm + ' ' + sexvar, size=10)
    plt.plot(x, y2, color='green', linewidth = 2, label = '2')
    plt.savefig('fm_' + nm + '.png')
    plt.show()
    plt.close()

# Note: algorithm does not work for Ashley, because it changed from a male to a female name (see below)

#Try to solve the mystery of the high error rate of Emma ca. 1900
#posted on Pastebin and prooffreaderplus.blogspot.com
dfm1900 = subset(dfm, '', 1900, 'year')
dfm1900.reset_index(drop=True, inplace=True)
dfm1900E = dfm1900[dfm1900.name.str.contains('^E')]
dfm1900E_a = dfm1900E[dfm1900E.name.str.contains('a$')]
dfm1900_a = dfm1900[dfm1900.name.str.contains('a$')]
print 'Boys born in 1900 whose names begin with E:\n'
print dfm1900E[['name', 'births', 'ranked']].head(60)
print '\n\nBoys born in 1900 whose names end with a:\n'
print dfm1900_a[['name', 'births', 'ranked']].head(50)
print '\n\nIntersection of both sets:\n'
print dfm1900E_a[['name', 'births', 'ranked']].head(50)

dff[dff.name == 'Ashley'].head()

dfm[dfm.name == 'Ashley'].head() #Added on March 3

# plot percent male and female births of Ashley
nm = 'Ashley'
f_and_m = df[df.name == nm]
f_and_m = pd.pivot_table(f_and_m, 'pct', rows=['year'], cols='sex')
f_and_m = pd.DataFrame(f_and_m)

figure(num=None, figsize=(8,7), dpi=150, facecolor='w', edgecolor='k')

plt.xlim(1880,2012)

plt.subplot(211)
#plt.title(nm)
plt.title("Ashley used to be a boys' name\n", size=20)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.legend(loc = 'upper left')
plt.subplot(212)
plt.ylim(0, 0.05)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(f_and_m.index, f_and_m.F, color='#cc0000', linewidth = 2, label = 'Girls')
plt.plot(f_and_m.index, f_and_m.M, color='#0000cc', linewidth = 2, label = 'Boys')
plt.annotate('magnification of top graph\n(10% of height of first tick mark)\nto see male names', xy=(.03, .95),  xycoords='axes fraction', size = 12, horizontalalignment='left', verticalalignment='top')
plt.savefig('ashleyMF.png')
plt.show()
plt.close()


name_chosen = "Sigourney"
sex_chosen = "F"

if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"


import scipy
graphdf = df[df.sex == sex_chosen]
graphdf = graphdf[graphdf.name == name_chosen]

figure(num=None, figsize=(11, 8), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
plt.xticks(scipy.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
#plt.savefig(name_chosen + '.png') # un-comment to save graph
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")

name_chosen = "Sharona"
sex_chosen = "F"

if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"


import scipy
graphdf = df[df.sex == sex_chosen]
graphdf = graphdf[graphdf.name == name_chosen]

figure(num=None, figsize=(8, 4), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="black", linewidth = 2)
plt.xlim(1880,2012)
plt.xticks(scipy.arange(1880,2012,10))
plt.title("Popularity of baby name '" + name_chosen + "', 1880-2012", size=18, color="black")
plt.xlabel('Year', size=15)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.savefig(name_chosen + '.png') # un-comment to save graph
plt.show()
plt.close()
#graphdf.to_csv(name_chosen + "_" + sex_chosen + ".csv")

#compare two names of same sex
name_chosen = "Marilyn"
name_chosen_2 = "Norma"
sex_chosen = "F"

if sex_chosen == "F":
    y_axis_sex = "female"
else:
    y_axis_sex = "male"

import scipy
graphdf = df[df.sex == sex_chosen]
graphdf2 = graphdf[graphdf.name == name_chosen_2]
graphdf = graphdf[graphdf.name == name_chosen]

figure(num=None, figsize=(7.5, 5), dpi=300, facecolor='w', edgecolor='w')
plt.plot(graphdf.year, graphdf.pct, color="#6500A4", linewidth = 3, label=name_chosen)
plt.plot(graphdf2.year, graphdf2.pct, color="#005E75", linewidth = 3, label=name_chosen_2)
plt.xlim(1880,2012)
plt.xticks(scipy.arange(1880,2012,10))
plt.title("Popularity of U.S. baby names\n" + name_chosen + " and " + name_chosen_2 + ", 1880-2012", size=16, color="black")
plt.xlabel('Year', size=12)
plt.ylabel('Percent of ' + y_axis_sex + ' births', size=14)
plt.legend(loc = 'upper left')
plt.savefig(name_chosen + '_v_' + name_chosen_2 + '.png') # un-comment to save graph
plt.show()
plt.close()

# plot rate of babies named "Unknown" or "Baby"
Unkn = df[df.name == 'Unknown']
Unkn = pd.pivot_table(Unkn, 'pct', rows=['year'], cols='name')
Unkn = pd.DataFrame(Unkn)
Baby = df[df.name == 'Baby']
Baby = pd.pivot_table(Baby, 'pct', rows=['year'], cols='name')
Baby = pd.DataFrame(Baby)

figure(num=None, figsize=(8,4), dpi=150, facecolor='w', edgecolor='k')

plt.xlim(1880,2012)

plt.title("U.S. babies named 'Unknown' or 'Baby', 1880-2012", size=18)
plt.xlim(1880, 2012)
plt.ylabel('% of births', size=10)
plt.plot(Unkn.index, Unkn.Unknown, color='#660066', linewidth = 2, label = 'Unknown')
plt.plot(Baby.index, Baby.Baby, color='#dd8833', linewidth = 2, label = 'Baby')
plt.legend(loc = 'upper left')
plt.savefig('Unknown_Baby.png')
plt.show()
plt.close()

years2 = years[years.year > 1880] # the first year, 1880, mucks up the 'new names' data
newperthou = years2.new_names * 1000.0 / years2.births_t
import scipy
zeroline = scipy.zeros(len(range(1881, 2013)))
figure(num=None, figsize=(15, 6), dpi=150, facecolor='w', edgecolor='w')
plt.plot(years2.year, newperthou, color="#0A5711", linewidth = 2)
plt.fill_between(years2.year, zeroline, newperthou, color="#20912B", alpha = 1.0, interpolate=True)
plt.xlim(1880,2012)
plt.xticks(scipy.arange(1880,2012,10))
plt.title("New names introduced per 1000 births", size=18, color="#0A5711")
plt.xlabel('Year', size=15)
plt.ylabel('Names appearing for first time / 1000 births', size=11)
plt.show()
plt.close()