In [1]:
import pandas as pd
from pandas import DataFrame, Series
import urllib2
import zipfile
import os

def download_name_data():
    """Download the SSA baby-names archive and save it as ``names.zip``.

    The archive is fetched from the hard-coded URL below and written in
    binary mode to ``names.zip`` in the current working directory. A
    confirmation line is printed once the file has been written.
    """
    name_data_url = "http://www.ssa.gov/oact/babynames/names.zip"
    response = urllib2.urlopen(name_data_url)
    try:
        zip_data = response.read()
    finally:
        # Release the connection even if the read fails part-way through.
        response.close()
    # Named `archive` rather than `zipfile` so the imported zipfile module
    # is not shadowed inside this function.
    with open("names.zip", "wb") as archive:
        archive.write(zip_data)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Successfully saved names.zip")

def extract_zip(zip_file, output_dir):
    """Extract every file in ``zip_file`` into ``output_dir``.

    Parameters
    ----------
    zip_file : str
        Path of the zip archive to read.
    output_dir : str
        Directory to extract into. It is created if it does not exist; an
        already-existing directory is reused.

    Raises
    ------
    OSError
        If ``output_dir`` cannot be created and is not an existing directory.
    ValueError
        If an archive member would be written outside ``output_dir``.
    """
    with zipfile.ZipFile(zip_file) as zfile:
        try:
            os.mkdir(output_dir)
        except OSError:
            # Only "directory already exists" is acceptable here; any other
            # failure (permissions, missing parent, ...) must surface.
            if not os.path.isdir(output_dir):
                raise

        root = os.path.abspath(output_dir)
        # Trailing separator so "/out" does not match a sibling "/out2".
        prefix = os.path.join(root, '')

        for filename in zfile.namelist():
            if filename.endswith('/'):
                # Directory entry: nothing to write; parent directories are
                # created on demand for the files inside it.
                continue
            target = os.path.abspath(os.path.join(root, filename))
            # Refuse members such as "../x" or absolute paths that would
            # land outside the requested output directory.
            if not target.startswith(prefix):
                raise ValueError("Unsafe path in archive: %r" % filename)
            parent = os.path.dirname(target)
            if not os.path.isdir(parent):
                os.makedirs(parent)
            with open(target, 'wb') as outfile:
                outfile.write(zfile.read(filename))
    # %-formatting keeps the message identical on Python 2 and Python 3.
    print("Extracted %s to %s" % (zip_file, output_dir))

def parse_names_data(names_dir, first_year=1880, last_year=2012):
    """Load the per-year ``yobYYYY.txt`` files into one DataFrame.

    Parameters
    ----------
    names_dir : str
        Directory containing one ``yob<year>.txt`` file per year.
    first_year, last_year : int, optional
        Inclusive range of years to load. The defaults (1880-2012) match
        the range this function originally hard-coded.

    Returns
    -------
    DataFrame
        Columns ``name``, ``sex``, ``births`` (read from the headerless CSV
        files) plus a ``year`` column, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``first_year`` is greater than ``last_year``.
    """
    if first_year > last_year:
        raise ValueError(
            "first_year (%d) must not exceed last_year (%d)"
            % (first_year, last_year))

    parts = []
    for year in range(first_year, last_year + 1):
        path = os.path.join(names_dir, "yob%d.txt" % year)
        # The files have no header row, so the column names are supplied.
        frame = pd.read_csv(path, names=['name', 'sex', 'births'])
        frame['year'] = year
        parts.append(frame)
    # ignore_index discards the per-file row numbers in favour of one
    # continuous index across all years.
    names = pd.concat(parts, ignore_index=True)
    return names
In [2]:
# Fetch the archive only when it is not already on disk, so the notebook
# runs from a clean checkout without re-downloading on every execution.
if not os.path.exists("names.zip"):
    download_name_data()
names_data_dir = "names_data"
extract_zip("names.zip", names_data_dir)
names = parse_names_data(names_data_dir)
Extracted names.zip to names_data
In [3]:
# Preview the first rows of the combined frame.
names.head()
Out[3]:
name sex births year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
In [4]:
# Preview the last rows of the combined frame.
names.tail()
Out[4]:
name sex births year
1758725 Zylin M 5 2012
1758726 Zymari M 5 2012
1758727 Zyrin M 5 2012
1758728 Zyrus M 5 2012
1758729 Zytaevius M 5 2012
In [5]:
# Boolean-mask selection: keep only the rows whose name column is 'Caleb'.
names[names.name == 'Caleb']
Out[5]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 166 entries, 1268 to 1744599
Data columns (total 4 columns):
name      166  non-null values
sex       166  non-null values
births    166  non-null values
year      166  non-null values
dtypes: int64(2), object(2)
In [34]:
def total_number_of_births(name, data=None):
    """Return the summed ``births`` over all rows whose name equals ``name``.

    Parameters
    ----------
    name : str
        Name to match exactly against the ``name`` column.
    data : DataFrame, optional
        Frame with ``name`` and ``births`` columns. When omitted, the
        notebook-level ``names`` frame is used, as before.

    Returns
    -------
    The sum of the matching ``births`` values; 0 when no row matches.
    """
    if data is None:
        # Fall back to the notebook-level frame for existing callers.
        data = names
    matches = data[data['name'] == name]
    return matches['births'].sum()

# Caption for the table displayed at the end of this cell.
print("Total number of people born with each name between 1880 and 2012:")
name_series = Series([
    'Caleb', 'Whitney', 'Hastin', 'Rafe', 'Dan',
    'Mary', 'Leah', 'Wesley', 'Ambria', 'Tradley',
])
# One total per name, in the same order as name_series.
births_list = [total_number_of_births(name) for name in name_series]

births_by_name_frame = DataFrame(
    {'births since 1880 in the US': births_list},
    index=name_series,
)
births_by_name_frame
Total number of people born with each name between 1880 and 2012:
Out[34]:
births since 1880 in the US
Caleb 234304
Whitney 99268
Hastin 43
Rafe 1632
Dan 105847
Mary 4124778
Leah 194257
Wesley 198311
Ambria 1855
Tradley 0
In [7]:
# Total births per year with one column per sex. The values, row key and
# column key are passed positionally because the keyword names for the two
# keys changed between pandas releases (rows/cols became index/columns).
total_births = names.pivot_table('births', 'year', 'sex', aggfunc=sum)
total_births.head()
Out[7]:
sex F M
year
1880 90993 110491
1881 91955 100746
1882 107850 113687
1883 112322 104630
1884 129022 114445
In [8]:
# Titled so the figure can be understood without reading the code.
total_births.plot(title="Total births by sex and year", figsize=(10,5))
Out[8]:
<matplotlib.axes.AxesSubplot at 0x1111943d0>
In [9]:
# One row per name, one column per year, births summed across sexes. Keys
# are positional (values, row key, column key) because their keyword names
# changed between pandas releases (rows/cols became index/columns).
births_by_year_by_name = names.pivot_table('births', 'name', 'year', aggfunc=sum)
In [10]:
# Label-based row lookup; .loc replaces the .ix indexer, which later pandas
# releases removed.
calebs_by_year = births_by_year_by_name.loc['Caleb']
In [11]:
# Plural "Calebs", not the possessive "Caleb's".
calebs_by_year.plot(title="Calebs born by year", figsize=(10,5))
Out[11]:
<matplotlib.axes.AxesSubplot at 0x11a60af50>
In [12]:
# .loc replaces the .ix indexer, which later pandas releases removed; the
# title labels the figure so it can be read on its own.
births_by_year_by_name.loc['Whitney'].plot(title="Whitneys born by year", figsize=(10,5))
Out[12]:
<matplotlib.axes.AxesSubplot at 0x11a4a65d0>
In [13]:
# .loc replaces the .ix indexer, which later pandas releases removed; the
# title labels the figure so it can be read on its own.
births_by_year_by_name.loc['Bertha'].plot(title="Berthas born by year", figsize=(10,5))
Out[13]:
<matplotlib.axes.AxesSubplot at 0x110a0c210>