Only Greek, Egyptian, Norse and Roman names are used, because there was too much confusion with others, e.g. Ora was a common girls' name with a Latin origin, but coincidentally was also a figure in Balto-Slavic mythology.
# --- Notebook configuration -------------------------------------------------
# File holding a Python list literal of the mythological names to look for.
listed_path = "lists/mythological_names_eg_gk_ro_no.list"

# Last year covered by the data.  Defined before the titles so that they
# follow it automatically instead of repeating the year by hand.
last_year = 2013 #change this when Social Security database is updated

# How many names per sex go into the "top names" charts.
top_cutoff = 6

totals_title = ("Mythological names in U.S. Social Security baby names "
                "database, 1880-%d" % last_year)
top_boys_title = ("Top %d mythological boys' names from U.S. Social Security "
                  "database, 1880-%d" % (top_cutoff, last_year))
top_girls_title = ("Top %d mythological girls' names from U.S. Social Security "
                   "database, 1880-%d" % (top_cutoff, last_year))

save_path = "user_charts" # files created by this notebook will be saved in this directory
import time
import os
if not os.path.isdir(save_path): # creates path if it does not exist
os.makedirs(save_path)
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn # comment out if you don't have it, but it makes good-looking charts
print 'This is standard output from download_and_process.py'
%run download_and_process.py
print '--------------------\nFirst 80 characters of list:'
listed_file = open(listed_path, "r").read()
print listed_file[:80] + ' ...'
all_listed = eval(listed_file) # make sure you trust this file!
all_listed_set = set(all_listed) # to remove duplicates
all_listed = list(all_listed)
print "all_listed: list of length", len(all_listed)
# reduce names dataframe to those matching list
print '--------------------\nDataframe names filtered to those that match list'
print "%d records to begin." % (len(names))
names_listed = names[names.name.isin(all_listed)].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
print "%d records remaining." % (len(names_listed))
listed_in_df = list(names_listed.name)
print names_listed.head(10)
listed_m = list(names[(names.sex == 'M') & (names.name.isin(listed_in_df))]['name'])
listed_f = list(names[(names.sex == 'F') & (names.name.isin(listed_in_df))]['name'])
#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed = yob[yob.name.isin(listed_in_df)].copy()
yob_listed.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))
# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'F'].groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'M'].groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()
# print chart of m and f totals
print '\n'
# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):
    """Return an axis limit slightly above x, rounded to two significant digits.

    x -- the largest value that has to fit on the axis.

    The result is the next multiple of 10 ** (magnitude - 1) strictly above
    floor(x / step) * step, e.g. 1234 -> 1300 and 50 -> 51.

    A value that is not strictly positive (zero, negative or NaN) has no
    base-10 logarithm, so a fallback limit of 1 is returned instead of
    letting math.log10 raise.
    """
    if not x > 0:
        # log10 is undefined here; any positive limit gives a usable axis.
        return 1
    significance = int(math.floor(math.log10(x)))
    step = 10 ** (significance - 1)
    return (math.floor(x / step) + 1) * step
# Chart the yearly share of births going to listed names, one line per sex.
#data
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)
pct_f = list(yob_listed_f_agg.pct)
pct_m = list(yob_listed_m_agg.pct)
plt.figure(figsize=(16,9))
plt.plot(xf, pct_f, color="red")
plt.plot(xm, pct_m, color="blue")
# One shared y limit just above the larger of the two series.
plt.ylim(0, determine_y_limit(max(pct_f + pct_m)))
# Follow last_year so the axis does not need editing when the data is updated.
plt.xlim(1880, last_year)
plt.title(totals_title, fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)
plt.show()
#function to make dataframe for top names
def top_df(yobdf, names, sexes, first_year=1880, end_year=None):
    """Build a year-by-(name, sex) table of 'pct' values for the given names.

    yobdf      = dataframe derived from yob; normally it would just be yob
                 itself.  It must have 'name', 'sex', 'year' and 'pct' columns.
    names      = list of names
    sexes      = list of length 1 for all the same sex, or same length as
                 names. 'F' and 'M' allowed
    first_year = first year that must be present in the result
    end_year   = last year that must be present in the result; when None the
                 module-level last_year is used

    Returns a dataframe indexed by year in ascending order, with one column
    per (name, sex) pair that occurs in yobdf.  Years without data are filled
    with 0.
    """
    assert len(sexes) == 1 or len(names) == len(sexes), \
        "sexes must have length 1 or the same length as names"
    if len(sexes) == 1:
        sexes = list(sexes) * len(names)
    # Select the wanted (name, sex) pairs in one pass instead of comparing
    # every row against every name.
    wanted = set(zip(names, sexes))
    keep = [pair in wanted for pair in zip(yobdf['name'], yobdf['sex'])]
    df_chart = yobdf[np.array(keep, dtype=bool)].copy()
    # Kept only so that the printed tail looks the same as before.
    df_chart['temp'] = 1
    print("Tail of dataframe:")
    print(df_chart.tail())
    output_df = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index='year',
                                            columns=['name', 'sex']))
    if end_year is None:
        end_year = last_year
    # Insert missing years in chronological position.  Appending them at the
    # end would leave the rows out of order for plotting.
    all_years = sorted(set(output_df.index) | set(range(first_year, end_year + 1)))
    output_df = output_df.reindex(all_years).fillna(0)
    return output_df
# Build the per-year 'pct' tables for the first top_cutoff entries of each
# per-sex name list; the whole yob frame is passed and top_df does the filtering.
listed_top_m = top_df(yob, listed_m[:top_cutoff], ['M'])
listed_top_f = top_df(yob, listed_f[:top_cutoff], ['F'])
#a single function to make the four different kinds of charts
def make_chart(df, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw one chart of name popularity and optionally save it as a PNG.

    df          = dataframe indexed by year whose columns are (name, sex) pairs
    form        = 'line', 'subplots_auto', 'subplots_same' or 'stream'
    title       = chart title for 'line' and 'stream'; a default is generated
                  when empty
    colors      = list of matplotlib colors; None or empty selects a built-in
                  palette
    smoothing   = number of rows on each side of a row to average over;
                  0 disables smoothing
    groupedlist = accepted for backward compatibility; not used
    baseline    = baseline argument handed to plt.stackplot ('stream' only)
    png_name    = when non-empty the figure is written to
                  save_path/<png_name>.png

    The figure is saved (if requested), shown and then closed.
    """
    # Sort by year so that the rows line up with the x axis even if the
    # caller's index is not in chronological order.
    dataframe = df.sort_index()
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    has_male = False
    has_female = False
    max_y = 0
    final_name = ''
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        # The maximum is taken from the unsmoothed data.
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M':
            has_male = True
        elif sex == 'F':
            has_female = True
        if smoothing > 0:
            n_rows = len(series)
            smoothed = []
            for row in range(n_rows):
                start = max(0, row - smoothing)
                # +1 because the slice end is exclusive: the window is
                # symmetric and always contains the row itself.
                stop = min(n_rows, row + smoothing + 1)
                smoothed.append(series.iloc[start:stop].mean())
            # Replace the whole column at once; chained .iloc assignment may
            # write to a temporary copy.
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"
    num_series = len(dataframe.columns)
    if not colors:
        # my own list of dark contrasting colors
        colors = ['#BB2114', '#0C5966', '#BA7814', '#4459AB', '#6B3838',
                  '#B8327B', '#2B947F', '#0D83B5', '#684287', '#8C962C',
                  '#92289E', '#242D7D']
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")
    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth=3)
        ax.set_ylim(0, determine_y_limit(max_y))
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size=13)
        ax.set_title(title, size=18)
        # Shrink the axes slightly to leave room for the legend underneath.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
                  shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        # squeeze=False always returns a 2-D array, so a single series works too.
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        overall_limit = determine_y_limit(max_y)
        print('Maximum alpha: %d percent' % (overall_limit))
        for counter, (name, sex) in enumerate(dataframe.columns):
            if sex == 'M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            # Each panel has its own y scale; the fill opacity shows how the
            # panel's peak compares with the overall limit.
            tint = 1.0 * current_ymax / overall_limit
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0, determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0],
                                       alpha=tint, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        overall_limit = determine_y_limit(max_y)
        print('Maximum y axis: %d percent' % (overall_limit))
        for counter, (name, sex) in enumerate(dataframe.columns):
            if sex == 'M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0, overall_limit)
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1],
                                       alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        scale = str(determine_y_limit(max_y)) + ')'
        yaxtext += scale
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values,
                              *[dataframe[column] for column in dataframe.columns],
                              colors=colors, baseline=baseline)
        # One colored rectangle per stacked area to use as legend handles.
        legendProxies = []
        for poly in polys:
            legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        # Hide the y ticks and their labels; booleans replace the old
        # 'off' strings.
        plt.tick_params(axis='y',
                        which='both', # major and minor ticks
                        left=False,
                        right=False,
                        labelleft=False)

    # Save before showing: once plt.show() has run the figure may already be
    # gone and the saved file would be blank.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
# line charts: one for the boys' table, one for the girls' table
for chart_df, chart_title in ((listed_top_m, top_boys_title),
                              (listed_top_f, top_girls_title)):
    make_chart(df=chart_df,
               form='line', # line , subplots_auto , subplots_same , stream
               title=chart_title,
               colors=[],
               smoothing=0,
               baseline='zero', # zero , sym , wiggle , weighted_wiggle
               )
# Renumber the matched names from zero and write them out for reference.
names_listed.reset_index(drop=True, inplace=True)
names_listed.head()
names_listed.to_csv('lists/names_matching_mythological_list.csv')
This is standard output from download_and_process.py Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names_x sexratio \ 68 2008 1886765 2035811 3922576 2046 32483 107.899553 69 2009 1832276 1978582 3810858 1789 32210 107.984932 70 2010 1771846 1912915 3684761 1635 31593 107.961696 71 2011 1752198 1891800 3643998 1539 31412 107.967250 72 2012 1751866 1886972 3638838 1531 31212 107.712120 unique_names_y_x unique_names_x unique_names_y_x unique_names_x \ 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 unique_names_y_x unique_names_x unique_names_y_x unique_names_x \ 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 unique_names_y_x unique_names_x unique_names_y 68 32483 32483 32483 69 32210 32210 32210 70 31593 31593 31593 71 31412 31412 31412 72 31212 31212 31212 -------------------- First 80 characters of list: ['Athena', 'Amaunet', 'Amen', 'Amon', 'Amun', 'Anat', 'Anqet', 'Antaios', 'Anubi ... all_listed: list of length 701 -------------------- Dataframe names filtered to those that match list 102690 records to begin. 134 records remaining. 
name sex year_count year_min year_max pct_sum pct_max 91 Doris F 134 1880 2013 41.513522 1.477798 431 Sophia F 134 1880 2013 18.491332 1.269789 258 Chloe F 134 1880 2013 10.271245 0.662304 264 Diana F 134 1880 2013 21.019023 0.480095 294 Flora F 134 1880 2013 12.321405 0.401130 64555 Seth M 134 1880 2013 8.829215 0.342210 13211 Khloe F 25 1989 2013 1.430133 0.304485 9 Delia F 134 1880 2013 4.716709 0.144228 800 Luna F 121 1880 2013 1.066362 0.099330 64817 Griffin M 125 1881 2013 1.782372 0.087336 -------------------- Dataframe yob filtered to those that match list (count only) 1792091 records to begin. 5045 records remaining. -------------------- Head of total matching list per year, female births pct year 1880 824 0.905564 1881 718 0.780825 1882 910 0.843764 1883 920 0.819074 1884 999 0.774287
Tail of dataframe: name sex births year pct ranked temp 1778363 Seth M 1578 2013 0.084319 231.0 1 1779752 Thor M 94 2013 0.005023 1617.5 1 1781010 Amon M 39 2013 0.002084 2898.0 1 1781160 Sol M 37 2013 0.001977 3008.0 1 1789275 Hercules M 6 2013 0.000321 11332.0 1 Tail of dataframe: name sex births year pct ranked temp 1759288 Diana F 1171 2013 0.067429 270.0 1 1759319 Phoebe F 1050 2013 0.060462 301.0 1 1760337 Delia F 176 2013 0.010135 1319.5 1 1761290 Doris F 81 2013 0.004664 2275.0 1 1762435 Minerva F 46 2013 0.002649 3408.0 1
# Show each distinct name in names_listed once, as input for manual review.
print names_listed.name.unique()
['Doris' 'Sophia' 'Chloe' 'Diana' 'Flora' 'Seth' 'Khloe' 'Delia' 'Luna' 'Griffin' 'Athena' 'Minerva' 'Phoebe' 'Vesta' 'Sol' 'Phoenix' 'Thalia' 'Isis' 'Rhea' 'Odin' 'Eris' 'Venus' 'Ares' 'Apollo' 'Persephone' 'Gerda' 'Amon' 'Thor' 'Lucina' 'Osiris' 'Shai' 'Ran' 'Nanna' 'Loki' 'Zeus' 'Freyja' 'Melaina' 'Gaia' 'Pelagia' 'Sia' 'Hercules' 'Artemis' 'Juno' 'Lousia' 'Lamia' 'Clio' 'Urania' 'Amen' 'Andromeda' 'Chloris' 'Hera' 'Ladon' 'Valkyrie' 'Athene' 'Mars' 'Aphrodite' 'Deianeira' 'Helios' 'Min' 'Clete' 'Areion' 'Nike' 'Cybele' 'Hermes' 'Fortuna' 'Caliope' 'Ourania' 'Poseidon' 'Janus' 'Mercury' 'Tyr' 'Areia' 'Chimera' 'Ceres' 'Anat' 'Holle' 'Anubis' 'Vali' 'Ra' 'Pallas' 'Kore' 'Demeter' 'Shu' 'Jupiter' 'Soteria' 'Makar' 'Amun' 'Maat' 'Lakinia' 'Pater' 'Agathe' 'Saturn' 'Tyche' 'Khepri' 'Aten' 'Set' 'Fenris' 'Horus' 'Megale']
Some of these names are only coincidentally mythological: for example, Seth is an Egyptian god's name but also a common Hebrew name, and Doris is a very minor mythological figure, so probably few parents were even aware of the connection (the same goes for Phoebe, Chloe, etc.; Diana is a tougher call, but I think parents are more likely not to have chosen the name because of a mythological association). Obviously, it's impossible to read parents' minds from data abstracted like this, so the only choice is to manually curate the names that most obviously come from mythology.
cutoffn = 0
# how many names will remain to evaluate after duplicates removed
from collections import OrderedDict
evallistm = OrderedDict()
evallistf = OrderedDict()
# remove names with more common duplicates in other sex
# this happens frequently in ssa db
for name in listed_m:
try:
pctf = names_listed[(names_listed.sex == 'F') &
(names_listed.name == name)].pct_max.iloc[0]
pctm = names_listed[(names_listed.sex == 'M') &
(names_listed.name == name)].pct_max.iloc[0]
except:
pctf = 98
pctm = 99
if (name not in names_listed[names_listed.sex == 'F'].name.unique() or
pctf < pctm):
evallistm[name] = ''
for name in listed_f:
try:
pctf = names_listed[(names_listed.sex == 'F') &
(names_listed.name == name)].pct_max.iloc[0]
pctm = names_listed[(names_listed.sex == 'M') &
(names_listed.name == name)].pct_max.iloc[0]
except:
pctf = 99
pctm = 98
if (name not in names_listed[names_listed.sex == 'M'].name.unique() or
pctm < pctf):
evallistf[name] = ''
if cutoffn > 0:
assert len(evallistm) > cutoffn
assert len(evallistf) > cutoffn
print evallistm[:cutoffn]
print evallistf[:cutoffn]
else:
print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf))
print evallistm
print ' '
print evallistf
Length of lists: 38 male, 61 female OrderedDict([('Sol', ''), ('Seth', ''), ('Griffin', ''), ('Amon', ''), ('Thor', ''), ('Hercules', ''), ('Ladon', ''), ('Odin', ''), ('Hermes', ''), ('Apollo', ''), ('Osiris', ''), ('Min', ''), ('Clete', ''), ('Zeus', ''), ('Phoenix', ''), ('Amen', ''), ('Mars', ''), ('Ares', ''), ('Loki', ''), ('Nike', ''), ('Ran', ''), ('Mercury', ''), ('Tyr', ''), ('Jupiter', ''), ('Kore', ''), ('Ra', ''), ('Anubis', ''), ('Helios', ''), ('Poseidon', ''), ('Makar', ''), ('Pater', ''), ('Amun', ''), ('Fenris', ''), ('Set', ''), ('Demeter', ''), ('Horus', ''), ('Megale', ''), ('Aten', '')]) OrderedDict([('Delia', ''), ('Minerva', ''), ('Doris', ''), ('Phoebe', ''), ('Chloe', ''), ('Diana', ''), ('Flora', ''), ('Sophia', ''), ('Rhea', ''), ('Venus', ''), ('Vesta', ''), ('Luna', ''), ('Thalia', ''), ('Lucina', ''), ('Athena', ''), ('Gerda', ''), ('Eris', ''), ('Artemis', ''), ('Aphrodite', ''), ('Isis', ''), ('Clio', ''), ('Persephone', ''), ('Melaina', ''), ('Shai', ''), ('Andromeda', ''), ('Lamia', ''), ('Sia', ''), ('Hera', ''), ('Nanna', ''), ('Urania', ''), ('Gaia', ''), ('Khloe', ''), ('Chloris', ''), ('Athene', ''), ('Janus', ''), ('Freyja', ''), ('Valkyrie', ''), ('Ourania', ''), ('Juno', ''), ('Vali', ''), ('Holle', ''), ('Cybele', ''), ('Pelagia', ''), ('Anat', ''), ('Soteria', ''), ('Pallas', ''), ('Fortuna', ''), ('Maat', ''), ('Caliope', ''), ('Chimera', ''), ('Deianeira', ''), ('Agathe', ''), ('Lousia', ''), ('Shu', ''), ('Areion', ''), ('Ceres', ''), ('Areia', ''), ('Saturn', ''), ('Lakinia', ''), ('Tyche', ''), ('Khepri', '')])
# Hand-curated verdicts: each name maps to 'acc' (accept as clearly
# mythological) or 'rej' (reject).
#manually copy and paste the above lists and assign
#'acc' or 'rej' individually to accept or reject
evallistm = OrderedDict([('Sol', 'rej'), ('Seth', 'rej'), ('Griffin', 'rej'), ('Amon', 'rej'),
('Thor', 'acc'), ('Hercules', 'acc'), ('Ladon', 'rej'), ('Odin', 'acc'),
('Hermes', 'acc'), ('Apollo', 'acc'), ('Osiris', 'acc'), ('Min', 'rej'),
('Clete', 'rej'), ('Zeus', 'acc'), ('Phoenix', 'acc'), ('Amen', 'rej'),
('Mars', 'acc'), ('Ares', 'acc'), ('Loki', 'acc'), ('Nike', 'rej'),
('Ran', 'rej'), ('Mercury', 'acc'), ('Tyr', 'acc'), ('Jupiter', 'acc'),
('Kore', 'rej'), ('Ra', 'acc'), ('Anubis', 'acc'), ('Helios', 'acc'),
('Poseidon', 'acc'), ('Makar', 'rej'), ('Pater', 'rej'), ('Amun', 'rej'),
('Fenris', 'acc'), ('Set', 'rej'), ('Demeter', 'rej'), ('Horus', 'acc'),
('Megale', 'rej'), ('Aten', 'acc'), ('Saturn', 'acc')])
evallistf = OrderedDict([('Athena', 'acc'), ('Delia', 'rej'), ('Minerva', 'acc'), ('Doris', 'rej'), ('Phoebe', 'rej'),
('Chloe', 'rej'), ('Diana', 'rej'), ('Flora', 'rej'), ('Sophia', 'rej'),
('Rhea', 'rej'), ('Venus', 'acc'), ('Vesta', 'acc'), ('Luna', 'rej'),
('Thalia', 'acc'), ('Lucina', 'rej'), ('Gerda', 'rej'), ('Eris', 'acc'),
('Artemis', 'acc'), ('Aphrodite', 'acc'), ('Isis', 'acc'), ('Clio', 'acc'),
('Persephone', 'acc'), ('Melaina', 'rej'), ('Shai', 'rej'), ('Andromeda', 'acc'),
('Lamia', 'acc'), ('Sia', 'rej'), ('Hera', 'acc'), ('Nanna', 'rej'), ('Urania', 'acc'),
('Gaia', 'acc'), ('Khloe', 'rej'), ('Chloris', 'rej'), ('Athene', 'acc'),
('Janus', 'rej'), ('Freyja', 'acc'), ('Valkyrie', 'acc'), ('Ourania', 'acc'),
('Juno', 'acc'), ('Vali', 'acc'), ('Holle', 'rej'), ('Cybele', 'acc'),
('Pelagia', 'rej'), ('Anat', 'rej'), ('Soteria', 'rej'), ('Pallas', 'acc'),
('Fortuna', 'rej'), ('Maat', 'acc'), ('Caliope', 'acc'), ('Chimera', 'acc'),
('Deianeira', 'rej'), ('Agathe', 'rej'), ('Lousia', 'rej'), ('Shu', 'rej'),
('Areion', 'rej'), ('Ceres', 'acc'), ('Areia', 'rej'), ('Saturn', 'rej'),
('Lakinia', 'rej'), ('Tyche', 'acc'), ('Khepri', 'acc'), ('Demeter', 'acc'),
('Nike', 'acc')])
# Note: Demeter and Nike were moved by hand from the males' list to the
# females' (they are 'rej' for males and 'acc' for females above); for some
# reason they spiked higher among males than among females.
# Similarly, Saturn was moved from the females' list to the males'.
# Test that all names have 'acc' or 'rej' values
final_m = []
final_f = []
names_not_validated = []
for item in evallistm:
if evallistm[item] not in ['acc', 'rej']:
names_not_validated.append(item)
elif evallistm[item] == 'acc':
final_m.append(item)
for item in evallistf:
if evallistf[item] not in ['acc', 'rej']:
names_not_validated.append(item)
elif evallistf[item] == 'acc':
final_f.append(item)
final_all = final_m + final_f
if len(names_not_validated) > 0:
print "The following names do not have 'acc' or 'rej' values: ", names_not_validated
raise exception("Names not validated")
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f
print 'Length: %d male, %d female\n' % (len(final_m), len(final_f))
cutmin = min(len(final_m), len(final_f))
final_m = final_m[:cutmin]
final_f = final_f[:cutmin]
print 'After resizing to %d names each:' % (cutmin)
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f
Accepted male names: ['Thor', 'Hercules', 'Odin', 'Hermes', 'Apollo', 'Osiris', 'Zeus', 'Phoenix', 'Mars', 'Ares', 'Loki', 'Mercury', 'Tyr', 'Jupiter', 'Ra', 'Anubis', 'Helios', 'Poseidon', 'Fenris', 'Horus', 'Aten', 'Saturn'] Accepted female names: ['Athena', 'Minerva', 'Venus', 'Vesta', 'Thalia', 'Eris', 'Artemis', 'Aphrodite', 'Isis', 'Clio', 'Persephone', 'Andromeda', 'Lamia', 'Hera', 'Urania', 'Gaia', 'Athene', 'Freyja', 'Valkyrie', 'Ourania', 'Juno', 'Vali', 'Cybele', 'Pallas', 'Maat', 'Caliope', 'Chimera', 'Ceres', 'Tyche', 'Khepri', 'Demeter', 'Nike'] Length: 22 male, 32 female After resizing to 22 names each: Accepted male names: ['Thor', 'Hercules', 'Odin', 'Hermes', 'Apollo', 'Osiris', 'Zeus', 'Phoenix', 'Mars', 'Ares', 'Loki', 'Mercury', 'Tyr', 'Jupiter', 'Ra', 'Anubis', 'Helios', 'Poseidon', 'Fenris', 'Horus', 'Aten', 'Saturn'] Accepted female names: ['Athena', 'Minerva', 'Venus', 'Vesta', 'Thalia', 'Eris', 'Artemis', 'Aphrodite', 'Isis', 'Clio', 'Persephone', 'Andromeda', 'Lamia', 'Hera', 'Urania', 'Gaia', 'Athene', 'Freyja', 'Valkyrie', 'Ourania', 'Juno', 'Vali']
from copy import deepcopy
oldm = deepcopy(evallistm)
oldf = deepcopy(evallistf)
cutoffn = 0
# how many names will remain to evaluate after duplicates removed
from collections import OrderedDict
evallistm = OrderedDict()
evallistf = OrderedDict()
# remove names with more common duplicates in other sex
# this happens frequently in ssa db
for name in listed_m:
try:
pctf = names_listed[(names_listed.sex == 'F') &
(names_listed.name == name)].pct_max.iloc[0]
pctm = names_listed[(names_listed.sex == 'M') &
(names_listed.name == name)].pct_max.iloc[0]
except:
pctf = 98
pctm = 99
if (name not in ['Demeter', 'Nike'] and (name not in names_listed[names_listed.sex == 'F'].name.unique() or
pctf < pctm or name == 'Saturn')):
evallistm[name] = ''
for name in listed_f:
try:
pctf = names_listed[(names_listed.sex == 'F') &
(names_listed.name == name)].pct_max.iloc[0]
pctm = names_listed[(names_listed.sex == 'M') &
(names_listed.name == name)].pct_max.iloc[0]
except:
pctf = 99
pctm = 98
if (name != 'Saturn' and (name not in names_listed[names_listed.sex == 'M'].name.unique() or
pctm < pctf or name in ['Demeter', 'Nike'])):
evallistf[name] = ''
for item in evallistm: # copy from above block
try:
evallistm[item] = oldm[item]
except:
pass
for item in evallistf:
try:
evallistf[item] = oldf[item]
except:
pass
if cutoffn > 0:
assert len(evallistm) > cutoffn
assert len(evallistf) > cutoffn
print evallistm[:cutoffn]
print evallistf[:cutoffn]
else:
print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf))
print evallistm
print ' '
print evallistf
Length of lists: 36 male, 61 female OrderedDict([('Sol', 'rej'), ('Seth', 'rej'), ('Griffin', 'rej'), ('Amon', 'rej'), ('Thor', 'acc'), ('Hercules', 'acc'), ('Ladon', 'rej'), ('Odin', 'acc'), ('Hermes', 'acc'), ('Apollo', 'acc'), ('Osiris', 'acc'), ('Min', 'rej'), ('Clete', 'rej'), ('Zeus', 'acc'), ('Phoenix', 'acc'), ('Amen', 'rej'), ('Mars', 'acc'), ('Ares', 'acc'), ('Loki', 'acc'), ('Ran', 'rej'), ('Mercury', 'acc'), ('Tyr', 'acc'), ('Jupiter', 'acc'), ('Kore', 'rej'), ('Ra', 'acc'), ('Anubis', 'acc'), ('Helios', 'acc'), ('Poseidon', 'acc'), ('Makar', 'rej'), ('Pater', 'rej'), ('Amun', 'rej'), ('Fenris', 'acc'), ('Set', 'rej'), ('Horus', 'acc'), ('Megale', 'rej'), ('Aten', 'acc')]) OrderedDict([('Delia', 'rej'), ('Minerva', 'acc'), ('Doris', 'rej'), ('Phoebe', 'rej'), ('Chloe', 'rej'), ('Diana', 'rej'), ('Flora', 'rej'), ('Sophia', 'rej'), ('Rhea', 'rej'), ('Venus', 'acc'), ('Vesta', 'acc'), ('Luna', 'rej'), ('Thalia', 'acc'), ('Lucina', 'rej'), ('Athena', 'acc'), ('Gerda', 'rej'), ('Eris', 'acc'), ('Artemis', 'acc'), ('Aphrodite', 'acc'), ('Isis', 'acc'), ('Clio', 'acc'), ('Persephone', 'acc'), ('Melaina', 'rej'), ('Shai', 'rej'), ('Andromeda', 'acc'), ('Lamia', 'acc'), ('Sia', 'rej'), ('Hera', 'acc'), ('Nanna', 'rej'), ('Urania', 'acc'), ('Gaia', 'acc'), ('Khloe', 'rej'), ('Chloris', 'rej'), ('Athene', 'acc'), ('Nike', 'acc'), ('Janus', 'rej'), ('Freyja', 'acc'), ('Valkyrie', 'acc'), ('Ourania', 'acc'), ('Juno', 'acc'), ('Vali', 'acc'), ('Holle', 'rej'), ('Cybele', 'acc'), ('Pelagia', 'rej'), ('Anat', 'rej'), ('Soteria', 'rej'), ('Pallas', 'acc'), ('Fortuna', 'rej'), ('Maat', 'acc'), ('Caliope', 'acc'), ('Chimera', 'acc'), ('Deianeira', 'rej'), ('Agathe', 'rej'), ('Lousia', 'rej'), ('Shu', 'rej'), ('Areion', 'rej'), ('Ceres', 'acc'), ('Areia', 'rej'), ('Lakinia', 'rej'), ('Tyche', 'acc'), ('Khepri', 'acc')])
# Hand-curated verdicts for the rebuilt lists: each name maps to 'acc'
# (accept as clearly mythological) or 'rej' (reject).
#manually copy and paste the above lists and assign
#'acc' or 'rej' individually to accept or reject
# Column ruler for keeping the literals within the PEP 8 line lengths
# (presumably 72 and 79/80 characters were meant); do not reach this ->|
#########1#########2#########3#########4#########5#########6#########7#2######9X
evallistm = OrderedDict([('Sol', 'rej'), ('Seth', 'rej'), ('Griffin', 'rej'),
('Amon', 'rej'), ('Thor', 'acc'), ('Hercules', 'acc'),
('Ladon', 'rej'), ('Odin', 'acc'), ('Hermes', 'acc'),
('Apollo', 'acc'), ('Osiris', 'acc'), ('Min', 'rej'),
('Clete', 'rej'), ('Zeus', 'acc'), ('Phoenix', 'acc'),
('Amen', 'rej'), ('Mars', 'acc'), ('Ares', 'acc'),
('Loki', 'acc'), ('Ran', 'rej'), ('Mercury', 'acc'),
('Tyr', 'acc'), ('Jupiter', 'acc'), ('Kore', 'rej'),
('Ra', 'acc'), ('Anubis', 'acc'), ('Helios', 'acc'),
('Poseidon', 'acc'), ('Makar', 'rej'),
('Pater', 'rej'), ('Amun', 'rej'), ('Fenris', 'acc'),
('Set', 'rej'), ('Horus', 'acc'), ('Megale', 'rej'),
('Aten', 'acc')])
evallistf = OrderedDict([('Athena', 'acc'), ('Delia', 'rej'), ('Minerva', 'acc'), ('Doris', 'rej'),
('Phoebe', 'rej'), ('Chloe', 'rej'), ('Diana', 'rej'),
('Flora', 'rej'), ('Sophia', 'rej'), ('Rhea', 'rej'),
('Venus', 'acc'), ('Vesta', 'acc'), ('Luna', 'rej'),
('Thalia', 'acc'), ('Lucina', 'rej'), ('Gerda', 'rej'),
('Eris', 'acc'), ('Artemis', 'acc'),
('Aphrodite', 'acc'), ('Isis', 'acc'), ('Clio', 'acc'),
('Persephone', 'acc'), ('Melaina', 'rej'),
('Shai', 'rej'), ('Andromeda', 'acc'),
('Lamia', 'acc'), ('Sia', 'rej'), ('Hera', 'acc'),
('Nanna', 'rej'), ('Urania', 'acc'), ('Gaia', 'acc'),
('Khloe', 'rej'), ('Chloris', 'rej'),
('Athene', 'acc'), ('Nike', 'acc'), ('Janus', 'rej'),
('Freyja', 'acc'), ('Valkyrie', 'acc'),
('Ourania', 'acc'), ('Juno', 'acc'), ('Vali', 'acc'),
('Holle', 'rej'), ('Cybele', 'acc'), ('Pelagia', 'rej'),
('Anat', 'rej'), ('Soteria', 'rej'), ('Pallas', 'acc'),
('Fortuna', 'rej'), ('Maat', 'acc'), ('Caliope', 'acc'),
('Chimera', 'acc'), ('Deianeira', 'rej'),
('Agathe', 'rej'), ('Lousia', 'rej'), ('Shu', 'rej'),
('Areion', 'rej'), ('Ceres', 'acc'), ('Areia', 'rej'),
('Lakinia', 'rej'), ('Tyche', 'acc'), ('Khepri', 'acc')])
# Note: Demeter and Nike were taken from the males' list and moved to the
# females'; for some reason they spiked higher among males than among females.
# Similarly, Saturn was moved from the females' list to the males'.
# In the literals above only Nike appears (as female); Saturn and Demeter
# are in neither list.
# Test that all names have 'acc' or 'rej' values
final_m = []
final_f = []
names_not_validated = []
for item in evallistm:
if evallistm[item] not in ['acc', 'rej']:
names_not_validated.append(item)
elif evallistm[item] == 'acc':
final_m.append(item)
for item in evallistf:
if evallistf[item] not in ['acc', 'rej']:
names_not_validated.append(item)
elif evallistf[item] == 'acc':
final_f.append(item)
if len(names_not_validated) > 0:
print "The following names do not have 'acc' or 'rej' values: ", names_not_validated
raise exception("Names not validated")
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f
print 'Length: %d male, %d female\n' % (len(final_m), len(final_f))
Accepted male names: ['Thor', 'Hercules', 'Odin', 'Hermes', 'Apollo', 'Osiris', 'Zeus', 'Phoenix', 'Mars', 'Ares', 'Loki', 'Mercury', 'Tyr', 'Jupiter', 'Ra', 'Anubis', 'Helios', 'Poseidon', 'Fenris', 'Horus', 'Aten'] Accepted female names: ['Athena', 'Minerva', 'Venus', 'Vesta', 'Thalia', 'Eris', 'Artemis', 'Aphrodite', 'Isis', 'Clio', 'Persephone', 'Andromeda', 'Lamia', 'Hera', 'Urania', 'Gaia', 'Athene', 'Nike', 'Freyja', 'Valkyrie', 'Ourania', 'Juno', 'Vali', 'Cybele', 'Pallas', 'Maat', 'Caliope', 'Chimera', 'Ceres', 'Tyche', 'Khepri'] Length: 21 male, 31 female
# manually limit to nice round number
nice_round_number = 100 # if too high, there will be no change
final_m = final_m[:nice_round_number]
final_f = final_f[:nice_round_number]
print 'After manually resizing to nice round number of %d names each:' % (nice_round_number)
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f
After manually resizing to nice round number of 100 names each: Accepted male names: ['Thor', 'Hercules', 'Odin', 'Hermes', 'Apollo', 'Osiris', 'Zeus', 'Phoenix', 'Mars', 'Ares', 'Loki', 'Mercury', 'Tyr', 'Jupiter', 'Ra', 'Anubis', 'Helios', 'Poseidon', 'Fenris', 'Horus', 'Aten'] Accepted female names: ['Athena', 'Minerva', 'Venus', 'Vesta', 'Thalia', 'Eris', 'Artemis', 'Aphrodite', 'Isis', 'Clio', 'Persephone', 'Andromeda', 'Lamia', 'Hera', 'Urania', 'Gaia', 'Athene', 'Nike', 'Freyja', 'Valkyrie', 'Ourania', 'Juno', 'Vali', 'Cybele', 'Pallas', 'Maat', 'Caliope', 'Chimera', 'Ceres', 'Tyche', 'Khepri']
# BTW, here's those missassigned (it appears) genders:
print names[names.name == 'Saturn']
print names[names.name == 'Demeter']
print names[names.name == 'Nike']
name sex year_count year_min year_max pct_sum pct_max 49524 Saturn F 2 1996 2001 0.000563 0.000285 name sex year_count year_min year_max pct_sum pct_max 95309 Demeter M 1 1920 1920 0.00047 0.00047 name sex year_count year_min year_max pct_sum pct_max 18008 Nike F 18 1953 2013 0.006075 0.000514 76992 Nike M 15 1989 2013 0.005376 0.000951
%run download_and_process.py
# reduce names dataframe to those matching list
# print '--------------------\nDataframe names filtered to those that match list'
# print "%d records to begin." % (len(names))
names_listed = names[((names.name.isin(final_m) & (names.sex == 'M')) |
(names.name.isin(final_f) & (names.sex == 'F')) )].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
# print "%d records remaining." % (len(names_listed))
# listed_in_df = list(names_listed.name)
# print names_listed.head(10)
# listed_m = list(names[(names.sex == 'M') & (names.name.isin(final_m))]['name'])
# listed_f = list(names[(names.sex == 'F') & (names.name.isin(final_f))]['name'])
#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed_m = yob[(yob.name.isin(final_m)) & (yob.sex == 'M')].copy()
yob_listed_m.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
yob_listed_f = yob[(yob.name.isin(final_f)) & (yob.sex == 'F')].copy()
yob_listed_f.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))
# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed_f.groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed_m.groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()
# print chart of m and f totals
print '\n'
# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):
    """Return an axis limit slightly above x, rounded to two significant digits.

    x -- the largest value that has to fit on the axis.

    The result is the next multiple of 10 ** (magnitude - 1) strictly above
    floor(x / step) * step, e.g. 1234 -> 1300 and 50 -> 51.

    A value that is not strictly positive (zero, negative or NaN) has no
    base-10 logarithm, so a fallback limit of 1 is returned instead of
    letting math.log10 raise.
    """
    if not x > 0:
        # log10 is undefined here; any positive limit gives a usable axis.
        return 1
    significance = int(math.floor(math.log10(x)))
    step = 10 ** (significance - 1)
    return (math.floor(x / step) + 1) * step
#data: yearly share of births going to listed names, one series per sex
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)
pct_f = list(yob_listed_f_agg.pct)
pct_m = list(yob_listed_m_agg.pct)

plt.figure(figsize=(16,9))
plt.plot(xf, pct_f, color="red")
plt.plot(xm, pct_m, color="blue")
# One shared y limit, a little above the larger of the two series.
plt.ylim(0, determine_y_limit(max(pct_f + pct_m)))
# last_year is the notebook-level setting for the newest year in the data,
# so the axis follows it when the database is updated.
plt.xlim(1880, last_year)
plt.title('Top 10 mythological names, boy=blue, girl=red', fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)
plt.show()
Data already downloaded. Data already extracted. Reading from pickle. Tail of dataframe 'yob': name sex births year pct ranked 1792086 Zyhier M 5 2013 0.000267 12995 1792087 Zylar M 5 2013 0.000267 12995 1792088 Zymari M 5 2013 0.000267 12995 1792089 Zymeer M 5 2013 0.000267 12995 1792090 Zyree M 5 2013 0.000267 12995 Tail of dataframe 'names': name sex year_count year_min year_max pct_sum pct_max 102685 Gross M 1 1925 1925 0.000538 0.000538 102686 Elik M 1 2012 2012 0.000318 0.000318 102687 Patrickjoseph M 1 1998 1998 0.000262 0.000262 102688 Southern M 1 1923 1923 0.000547 0.000547 102689 Jeon M 1 1999 1999 0.000261 0.000261 Tail of dataframe 'years': year births_f births_m births_t new_names unique_names_x sexratio \ 68 2008 1886765 2035811 3922576 2046 32483 107.899553 69 2009 1832276 1978582 3810858 1789 32210 107.984932 70 2010 1771846 1912915 3684761 1635 31593 107.961696 71 2011 1752198 1891800 3643998 1539 31412 107.967250 72 2012 1751866 1886972 3638838 1531 31212 107.712120 unique_names_y_x unique_names_x unique_names_y_x unique_names_x \ 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 unique_names_y_x unique_names_x unique_names_y_x unique_names_x \ 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 unique_names_y_x unique_names_x unique_names_y_x unique_names 68 32483 32483 32483 32483 69 32210 32210 32210 32210 70 31593 31593 31593 31593 71 31412 31412 31412 31412 72 31212 31212 31212 31212 -------------------- Dataframe yob filtered to those that match list (count only) 1792091 records to begin. 5045 records remaining. -------------------- Head of total matching list per year, female births pct year 1880 83 0.091216 1881 73 0.079388 1882 77 0.071395 1883 71 0.063211 1884 106 0.082157
# Show up to 50 listed names per sex (boys first, then girls, separated by a
# blank line) so we can see which ones to aggregate.
# cutoff of 10 already done
boys_listed = names_listed.loc[names_listed.sex == 'M']
girls_listed = names_listed.loc[names_listed.sex == 'F']
print(boys_listed.head(50))
print('')
print(girls_listed.head(50))
name sex year_count year_min year_max pct_sum pct_max 71603 Phoenix M 30 1968 2013 0.467450 0.041866 66819 Odin M 70 1884 2013 0.181126 0.023885 75344 Ares M 19 1983 2013 0.059744 0.012183 68771 Apollo M 45 1965 2013 0.092614 0.010901 65453 Thor M 103 1904 2013 0.223267 0.005603 69242 Osiris M 42 1970 2013 0.067574 0.005343 76827 Loki M 15 1996 2013 0.030797 0.004595 70915 Zeus M 33 1973 2013 0.043210 0.004542 65996 Hercules M 92 1908 2013 0.089908 0.003887 73252 Mars M 24 1923 2013 0.012551 0.001431 82133 Helios M 8 2000 2013 0.003586 0.001069 68592 Hermes M 46 1924 2013 0.024318 0.000937 86346 Poseidon M 4 2010 2013 0.002543 0.000802 78294 Mercury M 13 1972 2012 0.004748 0.000705 79245 Tyr M 11 2002 2013 0.005111 0.000695 81705 Anubis M 8 2002 2012 0.002825 0.000516 79701 Ra M 11 1972 2013 0.003533 0.000510 79294 Jupiter M 11 1981 2013 0.003643 0.000455 98214 Aten M 1 2013 2013 0.000267 0.000267 91950 Fenris M 2 2011 2012 0.000529 0.000265 95454 Horus M 1 2011 2011 0.000264 0.000264 name sex year_count year_min year_max pct_sum pct_max 1307 Athena F 104 1902 2013 1.335460 0.083207 75 Minerva F 134 1880 2013 2.093317 0.069236 679 Vesta F 127 1880 2012 1.481232 0.044859 1049 Thalia F 112 1885 2013 0.696109 0.038711 5300 Isis F 51 1901 2013 0.578503 0.030404 2831 Eris F 74 1913 2013 0.105284 0.018735 676 Venus F 127 1887 2013 0.610904 0.015362 5884 Persephone F 48 1962 2013 0.077109 0.009616 18907 Freyja F 17 1994 2013 0.020269 0.004376 11122 Gaia F 29 1980 2013 0.038319 0.004224 3353 Artemis F 68 1915 2013 0.045047 0.003800 20819 Juno F 15 1919 2013 0.019627 0.003481 7054 Lamia F 42 1968 2013 0.059279 0.002726 5469 Clio F 50 1894 2013 0.047900 0.002709 10392 Urania F 31 1891 2002 0.016488 0.002696 6648 Andromeda F 44 1962 2013 0.034651 0.002303 9510 Hera F 34 1970 2013 0.019592 0.001785 20011 Valkyrie F 16 1992 2013 0.010817 0.001484 14786 Athene F 22 1909 2003 0.011416 0.001440 3992 Aphrodite F 61 1915 2013 0.036654 0.001409 22754 Cybele F 13 1963 2010 0.008608 
0.000948 30199 Caliope F 7 1919 2013 0.003439 0.000916 20116 Ourania F 15 1963 2013 0.006050 0.000819 33277 Chimera F 6 1980 2001 0.002414 0.000659 42757 Ceres F 3 2005 2013 0.001351 0.000633 18008 Nike F 18 1953 2013 0.006075 0.000514 21706 Vali F 14 1952 1967 0.005284 0.000512 29183 Pallas F 8 1969 2007 0.003137 0.000508 29550 Maat F 8 1998 2013 0.002521 0.000403 58689 Tyche F 1 1999 1999 0.000282 0.000282 62227 Khepri F 1 2009 2009 0.000273 0.000273
# just take top 10
nice_round_number = 10 # if too high, there will be no change
# Slicing past the end of a list is harmless, so shorter lists stay as they are.
final_m, final_f = final_m[:nice_round_number], final_f[:nice_round_number]
print('After manually resizing to nice round number of %d names each:' % nice_round_number)
print('Accepted male names: %s' % (final_m,))
print('Accepted female names: %s' % (final_f,))
After manually resizing to nice round number of 10 names each: Accepted male names: ['Thor', 'Hercules', 'Odin', 'Hermes', 'Apollo', 'Osiris', 'Zeus', 'Phoenix', 'Mars', 'Ares'] Accepted female names: ['Athena', 'Minerva', 'Venus', 'Vesta', 'Thalia', 'Eris', 'Artemis', 'Aphrodite', 'Isis', 'Clio']
# NOTE: this rebinds the global `names` (a dataframe in the cells above) to a
# plain list of the names to chart.
names = final_m[:10]
sexes = ['M'] # can be length 1 or same length as names
yearstart=1880
yearend=2013

start = time.time()

if len(sexes) == 1:
    sexes = sexes * len(names)

# Keep only the rows of yob whose (name, sex) pair was requested. The pairs
# are tested in a single pass; writing a flag row by row through chained
# indexing is slow and is not guaranteed to modify the frame.
wanted_pairs = set(zip(names, sexes))
df_chart = yob[yob['name'].isin(names)]
pair_wanted = [pair in wanted_pairs
               for pair in zip(df_chart['name'], df_chart['sex'])]
df_chart = df_chart[np.array(pair_wanted, dtype=bool)].copy()

#To keep more than one data set for charts in memory, change name of chart_1
# One row per year, one column per (name, sex) pair, values are pct.
chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])

# Add a row for every year from yearstart to yearend that has no data, then
# treat every gap as zero and keep the years in ascending order.
all_years = chart_1.index.union(pd.Index(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()
#a single function to make the four different kinds of charts
def make_chart(df=chart_1, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw a chart of name popularity and optionally save it as a PNG.

    df          -- DataFrame with one row per year (the index) and one column
                   per (name, sex) pair. The default is the chart_1 that
                   existed when this function was defined.
    form        -- 'line', 'subplots_auto', 'subplots_same' or 'stream'.
    title       -- chart title; '' generates one from the data. Only the
                   'stream' form draws the title.
    colors      -- list of matplotlib colors; None or an empty list selects a
                   built-in 12-color palette in random order.
    smoothing   -- when > 0, every value is replaced by the mean of the values
                   from `smoothing` rows before it to `smoothing` rows after.
    groupedlist -- not used; accepted so existing calls keep working.
    baseline    -- stackplot baseline, used by the 'stream' form.
    png_name    -- if not '', the figure is saved as save_path/<png_name>.png.

    Uses the module-level plt, determine_y_limit and save_path.
    """
    dataframe = df.copy()
    # Plot against the real index so x and y always have the same length.
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    num_rows = len(dataframe)

    has_male = False
    has_female = False
    max_y = 0
    for name, sex in list(dataframe.columns):
        series = dataframe[(name, sex)]
        max_y = max(max_y, series.max())  # axis limits use the unsmoothed peak
        final_name = name
        if sex == 'M':
            has_male = True
        if sex == 'F':
            has_female = True
        if smoothing > 0:
            # Centered moving average: the window contains the row itself and
            # is clipped at both ends of the data.
            smoothed = []
            for row in range(num_rows):
                first = max(0, row - smoothing)
                last = min(num_rows, row + smoothing + 1)
                smoothed.append(series.iloc[first:last].mean())
            # Replace the column in one assignment (no chained .iloc writes).
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female

    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if colors is None or len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        from random import shuffle
        shuffle(colors)
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            label = "%s (%s)" % (name, sex) if has_both else name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        # NOTE(review): the line chart always starts at 1980, unlike the other
        # forms which start at the first year of the data -- confirm intent.
        ax.set_xlim(1980, endyear)
        ax.set_ylabel(y_text, size = 13)
        # Shrink the axes slightly to leave room for the legend below them.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        # squeeze=False keeps `axes` indexable even when there is one series.
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            # Each panel has its own y scale; the fill opacity shows how the
            # panel's peak compares with the overall limit.
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                              colors=colors, baseline=baseline)
        # stackplot polygons cannot be used in a legend directly, so build
        # same-colored rectangles to stand in for them.
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        plt.tick_params(
            axis='y',
            which='both',      # major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: once show() has run the current figure may already
    # be closed, and savefig would then write an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
#stream graph of the selected names
make_chart(chart_1,
           'stream',          # form: line , subplots_auto , subplots_same , stream
           title='',
           colors=[],
           smoothing=0,
           baseline='sym',    # zero , sym , wiggle , weighted_wiggle
           png_name='',       # if '', will not be saved
           )
# NOTE: this rebinds the global `names` (a dataframe in the cells above) to a
# plain list of the names to chart. Ten names, matching the chart title and
# the equivalent boys' cell.
names = final_f[:10]
sexes = ['F'] # can be length 1 or same length as names
yearstart=1880
yearend=2013

start = time.time()

if len(sexes) == 1:
    sexes = sexes * len(names)

# Keep only the rows of yob whose (name, sex) pair was requested. The pairs
# are tested in a single pass; writing a flag row by row through chained
# indexing is slow and is not guaranteed to modify the frame.
wanted_pairs = set(zip(names, sexes))
df_chart = yob[yob['name'].isin(names)]
pair_wanted = [pair in wanted_pairs
               for pair in zip(df_chart['name'], df_chart['sex'])]
df_chart = df_chart[np.array(pair_wanted, dtype=bool)].copy()

#To keep more than one data set for charts in memory, change name of chart_1
# One row per year, one column per (name, sex) pair, values are pct.
chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])

# Add a row for every year from yearstart to yearend that has no data, then
# treat every gap as zero and keep the years in ascending order.
all_years = chart_1.index.union(pd.Index(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()
#a single function to make the four different kinds of charts
def make_chart(df=chart_1, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw a chart of name popularity and optionally save it as a PNG.

    df          -- DataFrame with one row per year (the index) and one column
                   per (name, sex) pair. The default is the chart_1 that
                   existed when this function was defined.
    form        -- 'line', 'subplots_auto', 'subplots_same' or 'stream'.
    title       -- chart title; '' generates one from the data. Only the
                   'stream' form draws the title.
    colors      -- list of matplotlib colors; None or an empty list selects a
                   built-in 12-color palette in random order.
    smoothing   -- when > 0, every value is replaced by the mean of the values
                   from `smoothing` rows before it to `smoothing` rows after.
    groupedlist -- not used; accepted so existing calls keep working.
    baseline    -- stackplot baseline, used by the 'stream' form.
    png_name    -- if not '', the figure is saved as save_path/<png_name>.png.

    Uses the module-level plt, determine_y_limit and save_path.
    """
    dataframe = df.copy()
    # Plot against the real index so x and y always have the same length.
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    num_rows = len(dataframe)

    has_male = False
    has_female = False
    max_y = 0
    for name, sex in list(dataframe.columns):
        series = dataframe[(name, sex)]
        max_y = max(max_y, series.max())  # axis limits use the unsmoothed peak
        final_name = name
        if sex == 'M':
            has_male = True
        if sex == 'F':
            has_female = True
        if smoothing > 0:
            # Centered moving average: the window contains the row itself and
            # is clipped at both ends of the data.
            smoothed = []
            for row in range(num_rows):
                first = max(0, row - smoothing)
                last = min(num_rows, row + smoothing + 1)
                smoothed.append(series.iloc[first:last].mean())
            # Replace the column in one assignment (no chained .iloc writes).
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female

    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if colors is None or len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        from random import shuffle
        shuffle(colors)
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            label = "%s (%s)" % (name, sex) if has_both else name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        # NOTE(review): the line chart always starts at 1980, unlike the other
        # forms which start at the first year of the data -- confirm intent.
        ax.set_xlim(1980, endyear)
        ax.set_ylabel(y_text, size = 13)
        # Shrink the axes slightly to leave room for the legend below them.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        # squeeze=False keeps `axes` indexable even when there is one series.
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            # Each panel has its own y scale; the fill opacity shows how the
            # panel's peak compares with the overall limit.
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                              colors=colors, baseline=baseline)
        # stackplot polygons cannot be used in a legend directly, so build
        # same-colored rectangles to stand in for them.
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        plt.tick_params(
            axis='y',
            which='both',      # major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: once show() has run the current figure may already
    # be closed, and savefig would then write an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
#stream graph
# The year range in the title is built from the same yearstart/yearend used to
# prepare chart_1, so the title cannot drift from the data.
make_chart(df=chart_1,
           form='stream', # line , subplots_auto , subplots_same , stream
           title="10 most popular mythological girls' names, %d-%d" % (yearstart, yearend),
           colors= [],
           smoothing=0,
           baseline='sym', # zero , sym , wiggle , weighted_wiggle
           png_name = '', # if '', will not be saved
           )
# NOTE: this rebinds the global `names` (a dataframe in the cells above) to a
# plain list of the names to chart.
names = final_f[:10]
sexes = ['F'] # can be length 1 or same length as names
yearstart=1880 # for data, not graph
yearend=2013
xmin = 1940 # first year shown on the x axis; read by make_chart below

start = time.time()

if len(sexes) == 1:
    sexes = sexes * len(names)

# Keep only the rows of yob whose (name, sex) pair was requested. The pairs
# are tested in a single pass; writing a flag row by row through chained
# indexing is slow and is not guaranteed to modify the frame.
wanted_pairs = set(zip(names, sexes))
df_chart = yob[yob['name'].isin(names)]
pair_wanted = [pair in wanted_pairs
               for pair in zip(df_chart['name'], df_chart['sex'])]
df_chart = df_chart[np.array(pair_wanted, dtype=bool)].copy()

#To keep more than one data set for charts in memory, change name of chart_1
# One row per year, one column per (name, sex) pair, values are pct.
chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])

# Add a row for every year from yearstart to yearend that has no data, then
# treat every gap as zero and keep the years in ascending order.
all_years = chart_1.index.union(pd.Index(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()
#a single function to make the four different kinds of charts
def make_chart(df=chart_1, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw a chart of name popularity and optionally save it as a PNG.

    df          -- DataFrame with one row per year (the index) and one column
                   per (name, sex) pair. The default is the chart_1 that
                   existed when this function was defined.
    form        -- 'line', 'subplots_auto', 'subplots_same' or 'stream'.
    title       -- chart title; '' generates one from the data. Only the
                   'stream' form draws the title.
    colors      -- list of matplotlib colors; None or an empty list selects a
                   built-in 12-color palette in random order.
    smoothing   -- when > 0, every value is replaced by the mean of the values
                   from `smoothing` rows before it to `smoothing` rows after.
    groupedlist -- not used; accepted so existing calls keep working.
    baseline    -- stackplot baseline, used by the 'stream' form.
    png_name    -- if not '', the figure is saved as save_path/<png_name>.png.

    Uses the module-level plt, determine_y_limit and save_path, and reads the
    module-level xmin as the left edge of every x axis.
    """
    dataframe = df.copy()
    # Plot against the real index so x and y always have the same length.
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    num_rows = len(dataframe)

    has_male = False
    has_female = False
    max_y = 0
    for name, sex in list(dataframe.columns):
        series = dataframe[(name, sex)]
        max_y = max(max_y, series.max())  # axis limits use the unsmoothed peak
        final_name = name
        if sex == 'M':
            has_male = True
        if sex == 'F':
            has_female = True
        if smoothing > 0:
            # Centered moving average: the window contains the row itself and
            # is clipped at both ends of the data.
            smoothed = []
            for row in range(num_rows):
                first = max(0, row - smoothing)
                last = min(num_rows, row + smoothing + 1)
                smoothed.append(series.iloc[first:last].mean())
            # Replace the column in one assignment (no chained .iloc writes).
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female

    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if colors is None or len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        from random import shuffle
        shuffle(colors)
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            label = "%s (%s)" % (name, sex) if has_both else name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        # Shrink the axes slightly to leave room for the legend below them.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        # squeeze=False keeps `axes` indexable even when there is one series.
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            # Each panel has its own y scale; the fill opacity shows how the
            # panel's peak compares with the overall limit.
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,16.67), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(xmin, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                              colors=colors, baseline=baseline)
        # stackplot polygons cannot be used in a legend directly, so build
        # same-colored rectangles to stand in for them.
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        plt.tick_params(
            axis='y',
            which='both',      # major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: once show() has run the current figure may already
    # be closed, and savefig would then write an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
#stream graph of the selected names
make_chart(chart_1,
           'stream',          # form: line , subplots_auto , subplots_same , stream
           title='',
           colors=[],
           smoothing=0,
           baseline='sym',    # zero , sym , wiggle , weighted_wiggle
           png_name='',       # if '', will not be saved
           )
# NOTE: this rebinds the global `names` (a dataframe in the cells above) to a
# plain list of the names to chart.
names = final_m[:10]
sexes = ['M'] # can be length 1 or same length as names
yearstart=1880 # for data, not graph
yearend=2013
xmin = 1940 # first year shown on the x axis; read by make_chart below

start = time.time()

if len(sexes) == 1:
    sexes = sexes * len(names)

# Keep only the rows of yob whose (name, sex) pair was requested. The pairs
# are tested in a single pass; writing a flag row by row through chained
# indexing is slow and is not guaranteed to modify the frame.
wanted_pairs = set(zip(names, sexes))
df_chart = yob[yob['name'].isin(names)]
pair_wanted = [pair in wanted_pairs
               for pair in zip(df_chart['name'], df_chart['sex'])]
df_chart = df_chart[np.array(pair_wanted, dtype=bool)].copy()

#To keep more than one data set for charts in memory, change name of chart_1
# One row per year, one column per (name, sex) pair, values are pct.
chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])

# Add a row for every year from yearstart to yearend that has no data, then
# treat every gap as zero and keep the years in ascending order.
all_years = chart_1.index.union(pd.Index(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()
#a single function to make the four different kinds of charts
def make_chart(df=chart_1, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw a chart of name popularity and optionally save it as a PNG.

    df          -- DataFrame with one row per year (the index) and one column
                   per (name, sex) pair. The default is the chart_1 that
                   existed when this function was defined.
    form        -- 'line', 'subplots_auto', 'subplots_same' or 'stream'.
    title       -- chart title; '' generates one from the data. Only the
                   'stream' form draws the title.
    colors      -- list of matplotlib colors; None or an empty list selects a
                   built-in 12-color palette in random order.
    smoothing   -- when > 0, every value is replaced by the mean of the values
                   from `smoothing` rows before it to `smoothing` rows after.
    groupedlist -- not used; accepted so existing calls keep working.
    baseline    -- stackplot baseline, used by the 'stream' form.
    png_name    -- if not '', the figure is saved as save_path/<png_name>.png.

    Uses the module-level plt, determine_y_limit and save_path, and reads the
    module-level xmin as the left edge of every x axis.
    """
    dataframe = df.copy()
    # Plot against the real index so x and y always have the same length.
    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)
    legend_size = 0.01
    num_rows = len(dataframe)

    has_male = False
    has_female = False
    max_y = 0
    for name, sex in list(dataframe.columns):
        series = dataframe[(name, sex)]
        max_y = max(max_y, series.max())  # axis limits use the unsmoothed peak
        final_name = name
        if sex == 'M':
            has_male = True
        if sex == 'F':
            has_female = True
        if smoothing > 0:
            # Centered moving average: the window contains the row itself and
            # is clipped at both ends of the data.
            smoothed = []
            for row in range(num_rows):
                first = max(0, row - smoothing)
                last = min(num_rows, row + smoothing + 1)
                smoothed.append(series.iloc[first:last].mean())
            # Replace the column in one assignment (no chained .iloc writes).
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female

    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if colors is None or len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
        from random import shuffle
        shuffle(colors)
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            label = "%s (%s)" % (name, sex) if has_both else name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        # Shrink the axes slightly to leave room for the legend below them.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        # squeeze=False keeps `axes` indexable even when there is one series.
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            # Each panel has its own y scale; the fill opacity shows how the
            # panel's peak compares with the overall limit.
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(xmin, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(xmin, endyear)
        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                              colors=colors, baseline=baseline)
        # stackplot polygons cannot be used in a legend directly, so build
        # same-colored rectangles to stand in for them.
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)
        plt.tick_params(
            axis='y',
            which='both',      # major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: once show() has run the current figure may already
    # be closed, and savefig would then write an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
#stream graph of the selected names
make_chart(chart_1,
           'stream',          # form: line , subplots_auto , subplots_same , stream
           title='',
           colors=[],
           smoothing=0,
           baseline='sym',    # zero , sym , wiggle , weighted_wiggle
           png_name='',       # if '', will not be saved
           )