# ============================================================================
# Part 1: frequency of the LAST letter of newborn names, by year.
#
# Expects the companion script `download_and_process.py` to define a `yob`
# DataFrame with columns name / sex / year / births / pct, where pct is the
# percentage of that year's births carrying the name.
# Ported from a Python-2 / IPython notebook export: print statements and
# `%run` magics converted, removed pandas APIs (`df.sort`,
# `pivot_table(rows=, cols=)`) replaced, per-row loops vectorized.
# ============================================================================
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

last_year = 2013   # change this when the Social Security database is updated
sex = 'M'          # change this to 'F' to do the same analysis on girls' names
save_path = "user_last_letters_" + sex  # files created by Part 1 go here

if not os.path.isdir(save_path):  # create output dir if it does not exist
    os.makedirs(save_path)

# Build the `yob` DataFrame (was the IPython magic: %run download_and_process.py)
exec(open("download_and_process.py").read())

# --- aggregate percentage of births by (last letter, year) ------------------
# Vectorized replacement for the original per-year iterrows() loop.
df_half = yob.loc[yob.sex == sex, ['name', 'year', 'pct']].copy()
df_half['name'] = df_half['name'].str.lower()  # in case any name ends in a capital
df_half = df_half[df_half['year'].between(1880, last_year)]
df_half['letter'] = df_half['name'].str[-1]

df_last = df_half.groupby(['letter', 'year'], as_index=False)['pct'].sum()
print(df_last.tail())

# df_last_max: the most popular last letter (and its pct) for every year.
# Ties are kept, exactly like the original groupby/apply version.
df_last_max = (df_last.groupby('year', group_keys=False)
                      .apply(lambda t: t[t.pct == t.pct.max()])
                      .reset_index(drop=True))
max_overall = df_last_max.pct.max()

print("Tail of 'df_last_max':")
print(df_last_max.tail())
print("\nLast letters that were most popular in any given year:", end=' ')
for ltr in df_last_max.letter.unique():
    print(ltr, end=' ')
print("\nMaximum overall popularity of a last letter: %0.2f%%" % max_overall)

ALPHABET = 'abcdefghijklmnopqrstuvwxyz'


def _plot_letter_grid(df_last, filename, ymax=None):
    """Save a 5x6 grid of per-letter pct-vs-year line charts to `filename`.

    ymax=None lets each subplot autoscale its y axis (the "unscaled" grid);
    a number fixes every y axis to (0, ymax) so panels are comparable.
    """
    fig = plt.figure(figsize=(12, 7), dpi=150, facecolor='w', edgecolor='k')
    fig.subplots_adjust(hspace=.4)
    for i, ltr in enumerate(ALPHABET):
        subdf = df_last[df_last.letter == ltr]
        plt.subplot(5, 6, i + 1)
        plt.xlabel('')
        plt.xlim(1880, last_year)
        if ymax is not None:
            plt.ylim(0, ymax)
        # hide ticks/labels entirely; the panels are meant as sparklines
        plt.tick_params(axis='both', labelsize=0, length=0, width=0,
                        color='#ffffff')
        plt.ylabel("")
        plt.title(ltr, size=11)
        plt.plot(list(subdf.year), list(subdf.pct), color='black')
    plt.savefig(filename)
    plt.show()


os.chdir(save_path)
_plot_letter_grid(df_last, "grid_lastletters_unscaled.png")             # free y scales
_plot_letter_grid(df_last, "grid_lastletters_scaled.png", max_overall)  # shared y scale
os.chdir("../")

# --- lookup tables used by the per-year histograms --------------------------
# most common last letter each year
last_by_year = {year: df_last_max.loc[df_last_max.year == year, 'letter'].iloc[0]
                for year in range(1880, last_year + 1)}

# maximum pct ever reached by each letter
# (used on the blog to add fills to the by-letter plots above; fills can be
# done in matplotlib, of course — this was more expedient)
temppivot = pd.pivot_table(df_last, values='pct',
                           index='year', columns='letter').fillna(value=0)
last_max_dict = {ltr: temppivot[ltr].max() for ltr in ALPHABET}

# transposed pivot table (letters x years) of percent values; NaNs appear for
# letters that did not occur at all in a given year and are replaced by zeros
dflastpivott = pd.pivot_table(df_last, values='pct',
                              index='letter', columns='year').fillna(value=0)

# y positions for the big letter label on each year's histogram.  While the
# same letter stays on top, the label is never allowed to move downward, so
# it does not jitter between consecutive frames.
# NOTE(review): Python 3's round() uses banker's rounding; positions may
# differ by 2 from the Python 2 original at exact .5 inputs (cosmetic only).
y_dict = {}
last_ycalc = None
for year in range(1880, last_year + 1):
    top_pct = df_last_max.loc[df_last_max.year == year, 'pct'].iloc[0]
    ycalc = (round(top_pct / 2, 0) + 1) * 2  # snap to the next even value
    if year > 1880 and last_by_year[year - 1] == last_by_year[year]:
        ycalc = max(ycalc, last_ycalc)       # same leader: never drop
    y_dict[year] = ycalc
    last_ycalc = ycalc

# --- one histogram of the last-letter distribution per (selected) year ------
save_histograms = False  # change to True to save each histogram as a png
use_full_range = False   # change to True to get histograms of the entire dataset;
                         # kept False so the GitHub/nbviewer version is not huge
if use_full_range:
    start_year, end_year, skip_year = 1880, last_year, 1  # every year
else:
    start_year, end_year, skip_year = 1943, 2013, 10      # one per decade

os.chdir(save_path)
alphadict = {ltr: i for i, ltr in enumerate(ALPHABET)}  # letter -> bar index
for year in range(start_year, end_year + 1, skip_year):
    percentlist = list(dflastpivott[year])
    maxlet = last_by_year[year]
    width = 0.9  # the width of the bars
    fig = plt.figure(figsize=(10, 6), dpi=150, facecolor='w', edgecolor='k')
    ax = plt.subplot(111)
    barlist = plt.bar(range(26), percentlist, width=width, color='#aa4444')
    barlist[alphadict[maxlet]].set_color('#000088')  # highlight the leader
    ax.set_xticks(np.arange(26) + width / 2)
    ax.set_xticklabels(tuple(ALPHABET))
    ax.set_ylabel('% of names ending with letter', size=14)
    # NOTE(review): title hard-codes "boys'" even when sex == 'F' — confirm
    ax.set_title("Distribution of last letter in newborn boys' names",
                 size=20, color="#222222")
    plt.annotate(year, xy=(.98, .96), xycoords='axes fraction', size=32,
                 color='#aa4444', horizontalalignment='right',
                 verticalalignment='top')
    plt.annotate("Source: U.S. Social Security Administration",
                 xy=(0.05, 0.03), xycoords='figure fraction', size=10,
                 horizontalalignment='left', verticalalignment='bottom')
    plt.annotate("prooffreader.com", xy=(0.95, 0.03),
                 xycoords='figure fraction', size=13,
                 horizontalalignment='right', verticalalignment='bottom')
    plt.annotate(maxlet, xy=(alphadict[maxlet] + 0.45, y_dict[year] - 0.5),
                 xycoords='data', size='19', color="#000088",
                 horizontalalignment='center', verticalalignment='bottom')
    plt.ylim(0, 40)
    plt.xlim(0, 26)
    ax.xaxis.set_tick_params(width=0)
    if save_histograms:
        plt.savefig("histogram_last_letter_%s_%d.png" % (sex, year))
    plt.show()
    plt.close()
os.chdir("../")

# ============================================================================
# Part 2: proportion of names ending in 'n', by popularity quintile.
# (Originally a second concatenated notebook; the duplicate imports were
# dropped, the shared settings are re-declared as in the original.)
# ============================================================================
last_year = 2013  # change this when the Social Security database is updated
sex = 'M'         # change this to 'F' to do the same analysis on girls' names
save_path = "last_letters_" + sex  # NOTE(review): differs from Part 1's dir

if not os.path.isdir(save_path):
    os.makedirs(save_path)

# Rebuild `yob` (was: %run download_and_process.py)
exec(open("download_and_process.py").read())

# One row per name/year, most popular names first within each year.
# BUGFIX: the original filtered on the literal 'M', ignoring the `sex` switch.
df_quint = yob[yob.sex == sex].copy()
df_quint = df_quint.sort_values(['year', 'births'], ascending=[True, False])
df_quint['cumul_sum'] = df_quint.groupby('year')['pct'].cumsum()

# Popularity quintile 1..5 from the cumulative percentage.  If a name
# straddles a quintile boundary it lands in the lower quintile; one could
# distribute such names between quintiles, but doing so does not add anything
# substantive to the analysis.  clip() replaces the original "quintile == 6"
# fencepost fix for names at exactly 100%.
df_quint['quintile'] = ((df_quint['cumul_sum'] // 20).astype(int) + 1).clip(upper=5)

# endsn: 1 if the name's last letter is 'n' (vectorized; the original wrote
# these per-row via chained assignment)
df_quint['endsn'] = (df_quint['name'].str[-1] == 'n').astype(int)

print("Tail of df_quint:")
print(df_quint.tail())

# --- per-quintile and overall proportions of names ending in 'n' ------------
# by_year_n:     (year, quintile) -> births with names ending in 'n'
# by_year_notn:  same, for names NOT ending in 'n'
# by_year_propn: same, proportion ending in 'n'
by_year_n = {(y, q): 0
             for y in range(1880, last_year + 1) for q in range(1, 6)}
by_year_notn = {(y, q): 0
                for y in range(1880, last_year + 1) for q in range(1, 6)}

births_by = df_quint.groupby(['year', 'quintile', 'endsn'])['births'].sum()
for (y, q, ends_n), births in births_by.items():
    (by_year_n if ends_n == 1 else by_year_notn)[(y, q)] += births

by_year_propn = {key: 1.0 * by_year_n[key] / (by_year_n[key] + by_year_notn[key])
                 for key in by_year_n}

# overall (all quintiles pooled) proportion of births with names ending in 'n'
overall_totals = df_quint.groupby('year')['births'].sum()
overall_n_totals = (df_quint[df_quint.endsn == 1]
                    .groupby('year')['births'].sum()
                    .reindex(overall_totals.index, fill_value=0))
overall_propn = (1.0 * overall_n_totals / overall_totals).to_dict()

# import seaborn  # Uncomment if installed and you want nicer-looking graphs
# note that much of the graphics processing for the graphs shown on
# prooffreader.com was done after the fact in Photoshop

# Six stacked panels: the five quintiles, then the overall frequency of
# names ending in 'n'.
os.chdir(save_path)
fig = plt.figure(figsize=(12, 12), dpi=150, facecolor='w', edgecolor='k')
years_axis = list(range(1880, last_year + 1))
for panel in range(1, 7):
    if panel <= 5:
        series = [by_year_propn[(yr, panel)] for yr in years_axis]
    else:
        series = [overall_propn[yr] for yr in years_axis]
    plt.subplot(6, 1, panel)
    plt.xlabel('')
    plt.xlim(1880, last_year)
    plt.ylim(0, 1)
    plt.ylabel("")
    plt.title(' ', size=11)
    plt.plot(years_axis, series, color='black')
plt.savefig("quintiles_n_" + sex + ".png")
plt.show()
os.chdir("../")

df_quint.to_pickle("last_letters_" + sex + "/df_quint_" + sex + ".pickle")