import addutils.toc ; addutils.toc.js(ipy_notebook=True)
import numpy as np
import pandas as pd
import addutils
from IPython.display import display
import bokeh.plotting as bk
bk.output_notebook()
addutils.css_notebook()
We downloaded statistics about baby names choosen over years in the U.S. from: http://www.babycenter.com/baby-names and we stored them on our example data folder.
dataFolder = 'temp/baby_names/'
columnNames = ['name', 'sex', 'births']
names1880 = pd.read_csv(dataFolder+'yob1880.txt', names=columnNames)
names1880.head()
name | sex | births | |
---|---|---|---|
0 | Mary | F | 7065 |
1 | Anna | F | 2604 |
2 | Emma | F | 2003 |
3 | Elizabeth | F | 1939 |
4 | Minnie | F | 1746 |
This shows some of the names choosen during 1880.
Now we want to read all the files in the years that spaces from 1880 to 2011.
years = range(1880, 2012)
parts = []
for year in years:
path = '{0}yob{1}.txt'.format(dataFolder, year)
frame = pd.read_csv(path, names=columnNames)
frame['year'] = year
parts.append(frame)
Now parts is a python
list
containing pandas.DataFrame
(s). Let's create a single DataFrame
containing all the names.
names = pd.concat(parts, ignore_index=True)
names[::10**5]
name | sex | births | year | |
---|---|---|---|---|
0 | Mary | F | 7065 | 1880 |
100000 | Ernie | F | 13 | 1912 |
200000 | Lemoyne | M | 9 | 1922 |
300000 | Derrell | M | 43 | 1932 |
400000 | Valentine | M | 56 | 1943 |
500000 | Neal | M | 968 | 1953 |
600000 | Konni | F | 14 | 1962 |
700000 | Howard | F | 18 | 1970 |
800000 | Tomecca | F | 6 | 1976 |
900000 | Martrell | M | 8 | 1981 |
1000000 | Clearence | M | 6 | 1986 |
1100000 | Indiana | F | 15 | 1991 |
1200000 | Aminata | F | 30 | 1995 |
1300000 | Yutaro | M | 8 | 1998 |
1400000 | Mc | F | 14 | 2002 |
1500000 | Marshayla | F | 7 | 2005 |
1600000 | Treonna | F | 9 | 2008 |
1700000 | Kellsie | F | 13 | 2011 |
pandas.concat
concatenates pandas objects along a particular axis. If the optional parameter ignore_insex
is True, concat
won't use index values on the concatenation. Values from 0
to n-1
will be used instead.
DataFrame.pivot_table
creates a spreadsheet-style pivot table as a DataFrame. aggfunc
parameter specifies a list of aggregation functions to use on elements, margins
tells if grandtotal/subtotals are to be added to all columns/rows.
from bokeh.models.ranges import Range1d
totalBirths = names.pivot_table('births', index='year', columns='sex',
aggfunc=sum, margins=False)
#display(totalBirths.head())
#totalBirths[['F', 'M']][:-1].plot(title='Total births by sex and year')
fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=totalBirths.index, y=totalBirths['F'], legend='F', line_color='magenta')
fig.line(x=totalBirths.index, y=totalBirths['M'], legend='M', line_color='royalblue')
fig.legend.location = 'bottom_right'
fig.xaxis.axis_label = 'Year'
fig.yaxis.axis_label = 'Total births'
fig.yaxis[0].formatter.use_scientific = False
bk.show(fig)
Let's see a couple of example about splitting data.
In the first example we are going to view the number of births grouped by year and sex.
names.groupby(['year', 'sex'])['births'].sum().head()
year sex 1880 F 90994 M 110492 1881 F 91955 M 100747 1882 F 107851 Name: births, dtype: int64
The second example shows how to split the names in two groups: Boys and Girls.
boys = names[names.sex == 'M']
girls = names[names.sex == 'F']
display(boys[:2000:100])
name | sex | births | year | |
---|---|---|---|---|
942 | John | M | 9655 | 1880 |
1042 | Perry | M | 134 | 1880 |
1142 | Clayton | M | 60 | 1880 |
1242 | Judson | M | 31 | 1880 |
1342 | Wilmer | M | 19 | 1880 |
1442 | Rubin | M | 14 | 1880 |
1542 | Alois | M | 10 | 1880 |
1642 | Fayette | M | 8 | 1880 |
1742 | Toney | M | 7 | 1880 |
1842 | Titus | M | 6 | 1880 |
1942 | Leonidas | M | 5 | 1880 |
2980 | Tom | M | 349 | 1881 |
3080 | Lester | M | 87 | 1881 |
3180 | Elias | M | 41 | 1881 |
3280 | Hans | M | 23 | 1881 |
3380 | Aubrey | M | 15 | 1881 |
3480 | Ford | M | 11 | 1881 |
3580 | Rafael | M | 9 | 1881 |
3680 | Handy | M | 7 | 1881 |
3780 | Orla | M | 6 | 1881 |
We can see how many boys with a specific name were born each year.
boys[boys['name']=='Jayden']
name | sex | births | year | |
---|---|---|---|---|
824344 | Jayden | M | 7 | 1977 |
843205 | Jayden | M | 6 | 1978 |
899512 | Jayden | M | 9 | 1981 |
940021 | Jayden | M | 6 | 1983 |
960469 | Jayden | M | 5 | 1984 |
977725 | Jayden | M | 10 | 1985 |
997266 | Jayden | M | 14 | 1986 |
1019074 | Jayden | M | 11 | 1987 |
1040265 | Jayden | M | 16 | 1988 |
1062845 | Jayden | M | 22 | 1989 |
1087121 | Jayden | M | 25 | 1990 |
1111407 | Jayden | M | 38 | 1991 |
1136484 | Jayden | M | 45 | 1992 |
1161561 | Jayden | M | 77 | 1993 |
1187019 | Jayden | M | 159 | 1994 |
1212851 | Jayden | M | 239 | 1995 |
1238981 | Jayden | M | 294 | 1996 |
1265586 | Jayden | M | 387 | 1997 |
1292859 | Jayden | M | 620 | 1998 |
1320950 | Jayden | M | 1230 | 1999 |
1350145 | Jayden | M | 1821 | 2000 |
1380163 | Jayden | M | 2833 | 2001 |
1410493 | Jayden | M | 3853 | 2002 |
1441364 | Jayden | M | 5542 | 2003 |
1472909 | Jayden | M | 6920 | 2004 |
1505279 | Jayden | M | 8244 | 2005 |
1538659 | Jayden | M | 9610 | 2006 |
1573173 | Jayden | M | 15206 | 2007 |
1607962 | Jayden | M | 17105 | 2008 |
1642694 | Jayden | M | 17217 | 2009 |
1676954 | Jayden | M | 17101 | 2010 |
1710637 | Jayden | M | 16861 | 2011 |
bBirths = boys.pivot_table('births', index='year', columns='name',
aggfunc=sum, margins=False)
subset = bBirths[['Ray', 'Elvis', 'Sam', 'John', 'Marvin', 'Bob']]
plots = []
for name in subset.columns:
fig = bk.figure(plot_height=200, plot_width=700, title=None)
fig.line(x=np.asarray(subset.index), y=np.asarray(subset[name]),
line_color='black', legend=name)
plots.append([fig])
bk.show(bk.gridplot(plots))
# Or directly using Pandas (which uses Matplotlib, not Bokeh):
#subset.plot(subplots=True, figsize=(12, 10), grid=False,
# title="Number of births per year")
Now we are going to add a column
named 'prop` that shows the ratio: $\frac{\text{children with a specific name}}{\text{total children}}$
def add_prop(group):
births = group['births']
group['prop'] = births/float(births.sum())
return group
names = names.groupby(['year', 'sex']).apply(add_prop)
display(names.head())
name | sex | births | year | prop | |
---|---|---|---|---|---|
0 | Mary | F | 7065 | 1880 | 0.077642 |
1 | Anna | F | 2604 | 1880 | 0.028617 |
2 | Emma | F | 2003 | 1880 | 0.022012 |
3 | Elizabeth | F | 1939 | 1880 | 0.021309 |
4 | Minnie | F | 1746 | 1880 | 0.019188 |
Let's check our calculations by verifying that the sum of all porportions by sex must be equal (or at least close) to 1.
np.allclose(names.groupby(['year', 'sex'])['prop'].sum(), 1)
True
Now we want to extract the top names for each sex/year combination.
def get_top(group, topNumber):
return group.sort_values(by='births', ascending=False)[:topNumber]
grouped = names.groupby(['year', 'sex'])
topNames = grouped.apply(get_top, topNumber=10)
# rename indexes to avoid warning; index and columns should have different names
topNames.index.rename(['year_', 'sex_', None], inplace=True)
topNames[:50]
name | sex | births | year | prop | |||
---|---|---|---|---|---|---|---|
year_ | sex_ | ||||||
1880 | F | 0 | Mary | F | 7065 | 1880 | 0.077642 |
1 | Anna | F | 2604 | 1880 | 0.028617 | ||
2 | Emma | F | 2003 | 1880 | 0.022012 | ||
3 | Elizabeth | F | 1939 | 1880 | 0.021309 | ||
4 | Minnie | F | 1746 | 1880 | 0.019188 | ||
5 | Margaret | F | 1578 | 1880 | 0.017342 | ||
6 | Ida | F | 1472 | 1880 | 0.016177 | ||
7 | Alice | F | 1414 | 1880 | 0.015539 | ||
8 | Bertha | F | 1320 | 1880 | 0.014506 | ||
9 | Sarah | F | 1288 | 1880 | 0.014155 | ||
M | 942 | John | M | 9655 | 1880 | 0.087382 | |
943 | William | M | 9533 | 1880 | 0.086278 | ||
944 | James | M | 5927 | 1880 | 0.053642 | ||
945 | Charles | M | 5348 | 1880 | 0.048402 | ||
946 | George | M | 5126 | 1880 | 0.046392 | ||
947 | Frank | M | 3242 | 1880 | 0.029341 | ||
948 | Joseph | M | 2632 | 1880 | 0.023821 | ||
949 | Thomas | M | 2534 | 1880 | 0.022934 | ||
950 | Henry | M | 2444 | 1880 | 0.022119 | ||
951 | Robert | M | 2415 | 1880 | 0.021857 | ||
1881 | F | 2000 | Mary | F | 6919 | 1881 | 0.075243 |
2001 | Anna | F | 2698 | 1881 | 0.029340 | ||
2002 | Emma | F | 2034 | 1881 | 0.022120 | ||
2003 | Elizabeth | F | 1852 | 1881 | 0.020140 | ||
2004 | Margaret | F | 1658 | 1881 | 0.018031 | ||
2005 | Minnie | F | 1653 | 1881 | 0.017976 | ||
2006 | Ida | F | 1439 | 1881 | 0.015649 | ||
2007 | Annie | F | 1326 | 1881 | 0.014420 | ||
2008 | Bertha | F | 1324 | 1881 | 0.014398 | ||
2009 | Alice | F | 1308 | 1881 | 0.014224 | ||
M | 2938 | John | M | 8769 | 1881 | 0.087040 | |
2939 | William | M | 8524 | 1881 | 0.084608 | ||
2940 | James | M | 5442 | 1881 | 0.054016 | ||
2941 | George | M | 4664 | 1881 | 0.046294 | ||
2942 | Charles | M | 4636 | 1881 | 0.046016 | ||
2943 | Frank | M | 2834 | 1881 | 0.028130 | ||
2944 | Joseph | M | 2456 | 1881 | 0.024378 | ||
2945 | Henry | M | 2339 | 1881 | 0.023217 | ||
2946 | Thomas | M | 2282 | 1881 | 0.022651 | ||
2947 | Edward | M | 2177 | 1881 | 0.021609 | ||
1882 | F | 3935 | Mary | F | 8149 | 1882 | 0.075558 |
3936 | Anna | F | 3143 | 1882 | 0.029142 | ||
3937 | Emma | F | 2303 | 1882 | 0.021354 | ||
3938 | Elizabeth | F | 2187 | 1882 | 0.020278 | ||
3939 | Minnie | F | 2004 | 1882 | 0.018581 | ||
3940 | Margaret | F | 1821 | 1882 | 0.016884 | ||
3941 | Ida | F | 1673 | 1882 | 0.015512 | ||
3942 | Alice | F | 1542 | 1882 | 0.014298 | ||
3943 | Bertha | F | 1508 | 1882 | 0.013982 | ||
3944 | Annie | F | 1492 | 1882 | 0.013834 |
This is our concluding example and we want to measure the increasing in name diversity.
from bokeh.models.ranges import Range1d
diversity = topNames.pivot_table('prop', index='year', columns='sex', aggfunc=sum)
fig = bk.figure(plot_width=750, plot_height=300, title=None)
fig.line(x=diversity.index, y=diversity['F'], line_color='green', legend='F')
fig.line(x=diversity.index, y=diversity['M'], line_color='blue', legend='M')
fig.y_range = Range1d(0, 1.2)
bk.show(fig)
# Or, using directly Pandas' "plot" method (which calls Matplotlib, not Bokeh)
# diversity.plot(title='Sum of diversity.prop by year and sex',
# yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.