import time
import os

# Directory that receives every file this notebook writes.
save_path = "user_trendy"

# Create the output directory up front when it is not already present.
if not os.path.isdir(save_path):
    os.makedirs(save_path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%run download_and_process.py
last_year = years.year.max()
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names 128 2008 1886765 2035811 3922576 2046 32483 129 2009 1832276 1978582 3810858 1789 32210 130 2010 1771846 1912915 3684761 1635 31593 131 2011 1752198 1891800 3643998 1539 31412 132 2012 1751866 1886972 3638838 1531 31212
Low-popularity names are excluded because their low signal-to-noise ratio would create false positives.
# create dataframes of most popular names
top_cutoff = 1000  # consider only this number of most popular names of each sex;
# it saves calculation time and does not change final result
peak_height_cutoff = 0.1  # ten percent of peak height
###
import sys  # progress is written with sys.stdout.write instead of the Python-2-only print statement

start = time.time()

# The `top_cutoff` names of each sex with the largest summed yearly percentage.
dfnamesm = names[names.sex == 'M'].sort('pct_sum', ascending=False)[:top_cutoff]
dfnamesf = names[names.sex == 'F'].sort('pct_sum', ascending=False)[:top_cutoff]

# Column layout of the result table (alphabetical, as in the saved output).
result_columns = ['name', 'pct_max', 'pct_sum', 'pctcut_sum', 'sex',
                  'spikiness', 'tail_end', 'tail_front', 'year_count',
                  'year_max', 'year_min', 'yrcutend', 'yrcutratio',
                  'yrcutspan', 'yrcutstart']

# create dataframe of peak analyses
# One dict per analysed name; the frame is built once after the loops instead
# of appending a one-row frame per name.
records = []
sys.stdout.write("Countdown F, then M to zero: ")
for sx, dftop in (('F', dfnamesf), ('M', dfnamesm)):
    sys.stdout.write(sx + ' ')
    total = len(dftop)
    for nm in list(dftop.name):
        total -= 1
        if total % 100 == 0:
            sys.stdout.write(str(total) + ' ')

        # Summary row for this name/sex, looked up once and reused.
        meta = names[(names.name == nm) & (names.sex == sx)]
        pct_sum = meta.pct_sum.iloc[0]
        year_count = meta.year_count.iloc[0]
        year_min = meta.year_min.iloc[0]
        year_max = meta.year_max.iloc[0]

        # Yearly records for this name/sex and the height of its peak.
        df = yob[(yob.name == nm) & (yob.sex == sx)]
        pct_max = df.pct.max()

        # Years whose percentage reaches at least `peak_height_cutoff` of the peak.
        peak = df[df.pct >= peak_height_cutoff * pct_max]
        pctcut_sum = peak.pct.sum()
        yrcutstart = peak.year.min()  # first year at or above the threshold
        yrcutend = peak.year.max()    # last year at or above the threshold

        tail_front = yrcutstart - year_min  # years recorded before the window opens
        tail_end = year_max - yrcutend      # years recorded after the window closes
        yrcutspan = yrcutend - yrcutstart
        yrcutratio = 1.0 * yrcutspan / year_count
        # NOTE(review): a window confined to a single year gives yrcutspan == 0
        # and therefore a division by zero here - confirm how such names
        # should be scored.
        spikiness = pct_max / yrcutspan

        records.append({'name': nm,
                        'sex': sx,
                        'year_count': year_count,
                        'year_min': year_min,
                        'year_max': year_max,
                        'pct_max': pct_max,
                        'pct_sum': pct_sum,
                        'yrcutstart': yrcutstart,
                        'yrcutend': yrcutend,
                        'yrcutspan': yrcutspan,
                        'yrcutratio': yrcutratio,
                        'pctcut_sum': pctcut_sum,
                        'tail_front': tail_front,
                        'tail_end': tail_end,
                        'spikiness': spikiness})

dfresult = pd.DataFrame(records, columns=result_columns)

# Keep only names whose above-threshold window starts after their first
# recorded year and ends before their last one, i.e. both an ascent and a
# descent are present in the data.
dfresult = dfresult[(dfresult.tail_end != 0) & (dfresult.tail_front != 0)]
# Renumber the surviving rows so the saved table has a clean 0..n-1 index.
dfresult = dfresult.reset_index(drop=True)

# Build the output paths inside `save_path`; plain string concatenation would
# drop the path separator and write next to the directory instead of into it.
picklepath = os.path.join(save_path, 'trendiness_' + str(int(100 * peak_height_cutoff)) + '.pickle')
csvpath = os.path.join(save_path, 'trendiness_' + str(int(100 * peak_height_cutoff)) + '.csv')
dfresult.to_pickle(picklepath)
dfresult.to_csv(csvpath)
print('\nFiles saved.')
Countdown F, then M to zero: F 900 800 700 600 500 400 300 200 100 0 M 900 800 700 600 500 400 300 200 100 0 Files saved.
# Ten female names with the largest spikiness, highest first.
print(dfresult[dfresult['sex'] == 'F'].sort('spikiness', ascending=False).reset_index()[:10])
index name pct_max pct_sum pctcut_sum sex spikiness tail_end \ 0 0 Linda 5.666385 86.481786 78.126372 F 0.182787 44 1 0 Brittany 2.050094 19.855254 18.405681 F 0.120594 13 2 0 Debra 2.585195 29.131212 26.781168 F 0.117509 41 3 0 Shirley 4.040080 53.493663 46.610664 F 0.112224 56 4 0 Ashley 3.155460 46.890487 44.550597 F 0.105182 3 5 0 Jennifer 4.300706 88.233240 82.486767 F 0.104895 11 6 0 Deborah 2.816031 40.361956 36.586290 F 0.104297 39 7 0 Lisa 3.414439 55.271708 52.217140 F 0.100425 24 8 0 Jessica 3.221578 60.395536 57.361646 F 0.092045 7 9 0 Betty 3.396793 84.919816 76.302278 F 0.069322 54 tail_front year_count year_max year_min yrcutend yrcutratio \ 0 58 134 2013 1880 1969 0.231343 1 20 51 2013 1963 2000 0.333333 2 36 99 2013 1914 1972 0.222222 3 41 133 2013 1880 1957 0.270677 4 63 75 2013 1917 2010 0.400000 5 45 96 2013 1916 2002 0.427083 6 67 134 2013 1880 1974 0.201493 7 69 110 2013 1886 1989 0.309091 8 91 134 2013 1880 2006 0.261194 9 30 134 2013 1880 1959 0.365672 yrcutspan yrcutstart 0 31 1938 1 17 1983 2 22 1950 3 36 1921 4 30 1980 5 41 1961 6 27 1947 7 34 1955 8 35 1971 9 49 1910
# Ten male names with the largest spikiness, highest first.
print(dfresult[dfresult['sex'] == 'M'].sort('spikiness', ascending=False).reset_index()[:10])
index name pct_max pct_sum pctcut_sum sex spikiness tail_end \ 0 0 Dewey 0.908765 5.132570 2.104882 M 0.151461 110 1 0 Jason 3.482095 58.020317 54.847647 M 0.084929 4 2 0 Grover 0.712424 6.548641 3.900972 M 0.059369 118 3 0 Mark 2.754216 73.705981 65.994464 M 0.051966 14 4 0 Woodrow 0.455177 4.001271 2.711625 M 0.041380 91 5 0 Gary 2.026460 51.723374 48.724457 M 0.038970 28 6 0 Brian 2.290509 63.510556 61.009071 M 0.038822 4 7 0 Larry 1.909608 49.535771 44.995968 M 0.037443 31 8 0 Scott 1.748081 42.778283 39.141361 M 0.037193 16 9 0 Donald 2.945867 106.675879 100.563276 M 0.036369 30 tail_front year_count year_max year_min yrcutend yrcutratio \ 0 10 127 2013 1887 1903 0.047244 1 88 134 2013 1880 2009 0.305970 2 3 134 2013 1880 1895 0.089552 3 66 134 2013 1880 1999 0.395522 4 11 110 2013 1900 1922 0.100000 5 53 132 2013 1880 1985 0.393939 6 50 105 2013 1900 2009 0.561905 7 51 134 2013 1880 1982 0.380597 8 70 134 2013 1880 1997 0.350746 9 22 134 2013 1880 1983 0.604478 yrcutspan yrcutstart 0 6 1897 1 41 1968 2 12 1883 3 53 1946 4 11 1911 5 52 1933 6 59 1950 7 51 1931 8 47 1950 9 81 1902
# Yearly percentage for the female name Linda. The series is indexed by year
# before plotting so the x axis shows years rather than yob row labels.
yob[(yob.name=='Linda')&(yob.sex=='F')].sort('year').set_index('year')['pct'].plot()
<matplotlib.axes.AxesSubplot at 0xe591240>
# Yearly percentage for the male name Dewey. The series is indexed by year
# before plotting so the x axis shows years rather than yob row labels.
yob[(yob.name=='Dewey')&(yob.sex=='M')].sort('year').set_index('year')['pct'].plot()
<matplotlib.axes.AxesSubplot at 0xe584860>