# ============================================================================
# Part 1: frequency of the LAST letter of newborn names, by year.
#
# Expects the companion script `download_and_process.py` to define a `yob`
# DataFrame with columns name / sex / year / births / pct, where pct is the
# percentage of that year's births carrying the name.
# Ported from a Python-2 / IPython notebook export: print statements and
# `%run` magics converted, removed pandas APIs (`df.sort`,
# `pivot_table(rows=, cols=)`) replaced, per-row loops vectorized.
# ============================================================================
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

last_year = 2013   # change this when the Social Security database is updated
sex = 'M'          # change this to 'F' to do the same analysis on girls' names
save_path = "user_last_letters_" + sex  # files created by Part 1 go here

if not os.path.isdir(save_path):  # create output dir if it does not exist
    os.makedirs(save_path)

# Build the `yob` DataFrame (was the IPython magic: %run download_and_process.py)
exec(open("download_and_process.py").read())

# --- aggregate percentage of births by (last letter, year) ------------------
# Vectorized replacement for the original per-year iterrows() loop.
df_half = yob.loc[yob.sex == sex, ['name', 'year', 'pct']].copy()
df_half['name'] = df_half['name'].str.lower()  # in case any name ends in a capital
df_half = df_half[df_half['year'].between(1880, last_year)]
df_half['letter'] = df_half['name'].str[-1]

df_last = df_half.groupby(['letter', 'year'], as_index=False)['pct'].sum()
print(df_last.tail())

# df_last_max: the most popular last letter (and its pct) for every year.
# Ties are kept, exactly like the original groupby/apply version.
df_last_max = (df_last.groupby('year', group_keys=False)
                      .apply(lambda t: t[t.pct == t.pct.max()])
                      .reset_index(drop=True))
max_overall = df_last_max.pct.max()

print("Tail of 'df_last_max':")
print(df_last_max.tail())
print("\nLast letters that were most popular in any given year:", end=' ')
for ltr in df_last_max.letter.unique():
    print(ltr, end=' ')
print("\nMaximum overall popularity of a last letter: %0.2f%%" % max_overall)

ALPHABET = 'abcdefghijklmnopqrstuvwxyz'


def _plot_letter_grid(df_last, filename, ymax=None):
    """Save a 5x6 grid of per-letter pct-vs-year line charts to `filename`.

    ymax=None lets each subplot autoscale its y axis (the "unscaled" grid);
    a number fixes every y axis to (0, ymax) so panels are comparable.
    """
    fig = plt.figure(figsize=(12, 7), dpi=150, facecolor='w', edgecolor='k')
    fig.subplots_adjust(hspace=.4)
    for i, ltr in enumerate(ALPHABET):
        subdf = df_last[df_last.letter == ltr]
        plt.subplot(5, 6, i + 1)
        plt.xlabel('')
        plt.xlim(1880, last_year)
        if ymax is not None:
            plt.ylim(0, ymax)
        # hide ticks/labels entirely; the panels are meant as sparklines
        plt.tick_params(axis='both', labelsize=0, length=0, width=0,
                        color='#ffffff')
        plt.ylabel("")
        plt.title(ltr, size=11)
        plt.plot(list(subdf.year), list(subdf.pct), color='black')
    plt.savefig(filename)
    plt.show()


os.chdir(save_path)
_plot_letter_grid(df_last, "grid_lastletters_unscaled.png")             # free y scales
_plot_letter_grid(df_last, "grid_lastletters_scaled.png", max_overall)  # shared y scale
os.chdir("../")

# --- lookup tables used by the per-year histograms --------------------------
# most common last letter each year
last_by_year = {year: df_last_max.loc[df_last_max.year == year, 'letter'].iloc[0]
                for year in range(1880, last_year + 1)}

# maximum pct ever reached by each letter
# (used on the blog to add fills to the by-letter plots above; fills can be
# done in matplotlib, of course — this was more expedient)
temppivot = pd.pivot_table(df_last, values='pct',
                           index='year', columns='letter').fillna(value=0)
last_max_dict = {ltr: temppivot[ltr].max() for ltr in ALPHABET}

# transposed pivot table (letters x years) of percent values; NaNs appear for
# letters that did not occur at all in a given year and are replaced by zeros
dflastpivott = pd.pivot_table(df_last, values='pct',
                              index='letter', columns='year').fillna(value=0)

# y positions for the big letter label on each year's histogram.  While the
# same letter stays on top, the label is never allowed to move downward, so
# it does not jitter between consecutive frames.
# NOTE(review): Python 3's round() uses banker's rounding; positions may
# differ by 2 from the Python 2 original at exact .5 inputs (cosmetic only).
y_dict = {}
last_ycalc = None
for year in range(1880, last_year + 1):
    top_pct = df_last_max.loc[df_last_max.year == year, 'pct'].iloc[0]
    ycalc = (round(top_pct / 2, 0) + 1) * 2  # snap to the next even value
    if year > 1880 and last_by_year[year - 1] == last_by_year[year]:
        ycalc = max(ycalc, last_ycalc)       # same leader: never drop
    y_dict[year] = ycalc
    last_ycalc = ycalc

# --- one histogram of the last-letter distribution per (selected) year ------
save_histograms = False  # change to True to save each histogram as a png
use_full_range = False   # change to True to get histograms of the entire dataset;
                         # kept False so the GitHub/nbviewer version is not huge
if use_full_range:
    start_year, end_year, skip_year = 1880, last_year, 1  # every year
else:
    start_year, end_year, skip_year = 1943, 2013, 10      # one per decade

os.chdir(save_path)
alphadict = {ltr: i for i, ltr in enumerate(ALPHABET)}  # letter -> bar index
for year in range(start_year, end_year + 1, skip_year):
    percentlist = list(dflastpivott[year])
    maxlet = last_by_year[year]
    width = 0.9  # the width of the bars
    fig = plt.figure(figsize=(10, 6), dpi=150, facecolor='w', edgecolor='k')
    ax = plt.subplot(111)
    barlist = plt.bar(range(26), percentlist, width=width, color='#aa4444')
    barlist[alphadict[maxlet]].set_color('#000088')  # highlight the leader
    ax.set_xticks(np.arange(26) + width / 2)
    ax.set_xticklabels(tuple(ALPHABET))
    ax.set_ylabel('% of names ending with letter', size=14)
    # NOTE(review): title hard-codes "boys'" even when sex == 'F' — confirm
    ax.set_title("Distribution of last letter in newborn boys' names",
                 size=20, color="#222222")
    plt.annotate(year, xy=(.98, .96), xycoords='axes fraction', size=32,
                 color='#aa4444', horizontalalignment='right',
                 verticalalignment='top')
    plt.annotate("Source: U.S. Social Security Administration",
                 xy=(0.05, 0.03), xycoords='figure fraction', size=10,
                 horizontalalignment='left', verticalalignment='bottom')
    plt.annotate("prooffreader.com", xy=(0.95, 0.03),
                 xycoords='figure fraction', size=13,
                 horizontalalignment='right', verticalalignment='bottom')
    plt.annotate(maxlet, xy=(alphadict[maxlet] + 0.45, y_dict[year] - 0.5),
                 xycoords='data', size='19', color="#000088",
                 horizontalalignment='center', verticalalignment='bottom')
    plt.ylim(0, 40)
    plt.xlim(0, 26)
    ax.xaxis.set_tick_params(width=0)
    if save_histograms:
        plt.savefig("histogram_last_letter_%s_%d.png" % (sex, year))
    plt.show()
    plt.close()
os.chdir("../")

# ============================================================================
# Part 2: proportion of names ending in 'n', by popularity quintile.
# (Originally a second concatenated notebook; the duplicate imports were
# dropped, the shared settings are re-declared as in the original.)
# ============================================================================
last_year = 2013  # change this when the Social Security database is updated
sex = 'M'         # change this to 'F' to do the same analysis on girls' names
save_path = "last_letters_" + sex  # NOTE(review): differs from Part 1's dir

if not os.path.isdir(save_path):
    os.makedirs(save_path)

# Rebuild `yob` (was: %run download_and_process.py)
exec(open("download_and_process.py").read())

# One row per name/year, most popular names first within each year.
# BUGFIX: the original filtered on the literal 'M', ignoring the `sex` switch.
df_quint = yob[yob.sex == sex].copy()
df_quint = df_quint.sort_values(['year', 'births'], ascending=[True, False])
df_quint['cumul_sum'] = df_quint.groupby('year')['pct'].cumsum()

# Popularity quintile 1..5 from the cumulative percentage.  If a name
# straddles a quintile boundary it lands in the lower quintile; one could
# distribute such names between quintiles, but doing so does not add anything
# substantive to the analysis.  clip() replaces the original "quintile == 6"
# fencepost fix for names at exactly 100%.
df_quint['quintile'] = ((df_quint['cumul_sum'] // 20).astype(int) + 1).clip(upper=5)

# endsn: 1 if the name's last letter is 'n' (vectorized; the original wrote
# these per-row via chained assignment)
df_quint['endsn'] = (df_quint['name'].str[-1] == 'n').astype(int)

print("Tail of df_quint:")
print(df_quint.tail())

# --- per-quintile and overall proportions of names ending in 'n' ------------
# by_year_n:     (year, quintile) -> births with names ending in 'n'
# by_year_notn:  same, for names NOT ending in 'n'
# by_year_propn: same, proportion ending in 'n'
by_year_n = {(y, q): 0
             for y in range(1880, last_year + 1) for q in range(1, 6)}
by_year_notn = {(y, q): 0
                for y in range(1880, last_year + 1) for q in range(1, 6)}

births_by = df_quint.groupby(['year', 'quintile', 'endsn'])['births'].sum()
for (y, q, ends_n), births in births_by.items():
    (by_year_n if ends_n == 1 else by_year_notn)[(y, q)] += births

by_year_propn = {key: 1.0 * by_year_n[key] / (by_year_n[key] + by_year_notn[key])
                 for key in by_year_n}

# overall (all quintiles pooled) proportion of births with names ending in 'n'
overall_totals = df_quint.groupby('year')['births'].sum()
overall_n_totals = (df_quint[df_quint.endsn == 1]
                    .groupby('year')['births'].sum()
                    .reindex(overall_totals.index, fill_value=0))
overall_propn = (1.0 * overall_n_totals / overall_totals).to_dict()

# import seaborn  # Uncomment if installed and you want nicer-looking graphs
# note that much of the graphics processing for the graphs shown on
# prooffreader.com was done after the fact in Photoshop

# Six stacked panels: the five quintiles, then the overall frequency of
# names ending in 'n'.
os.chdir(save_path)
fig = plt.figure(figsize=(12, 12), dpi=150, facecolor='w', edgecolor='k')
years_axis = list(range(1880, last_year + 1))
for panel in range(1, 7):
    if panel <= 5:
        series = [by_year_propn[(yr, panel)] for yr in years_axis]
    else:
        series = [overall_propn[yr] for yr in years_axis]
    plt.subplot(6, 1, panel)
    plt.xlabel('')
    plt.xlim(1880, last_year)
    plt.ylim(0, 1)
    plt.ylabel("")
    plt.title(' ', size=11)
    plt.plot(years_axis, series, color='black')
plt.savefig("quintiles_n_" + sex + ".png")
plt.show()
os.chdir("../")

df_quint.to_pickle("last_letters_" + sex + "/df_quint_" + sex + ".pickle")