"""Explore the US Social Security Administration (SSA) baby-name data.

Run as a script, this extracts ``names.zip`` (downloading it first when it is
missing), loads the per-year ``yobYYYY.txt`` files into a single DataFrame and
builds a few summary tables and plots.  Importing the module only defines the
helper functions; no files are touched.

The code is written to run unchanged on Python 2.7 and Python 3.
"""
import errno
import os
import zipfile
from contextlib import closing

import pandas as pd
from pandas import DataFrame, Series

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the same ``urlopen`` entry point.
    import urllib.request as urllib2

NAME_DATA_URL = "http://www.ssa.gov/oact/babynames/names.zip"
NAMES_ZIP = "names.zip"
FIRST_YEAR = 1880
LAST_YEAR = 2012


def download_name_data(url=NAME_DATA_URL, destination=NAMES_ZIP):
    """Download the baby-names archive from *url* and save it to *destination*.

    The whole archive is read into memory before being written out.
    """
    # closing() is used because the Python 2 response object is not a
    # context manager on its own.
    with closing(urllib2.urlopen(url)) as response:
        zip_data = response.read()
    with open(destination, "wb") as out_file:
        out_file.write(zip_data)
    print("Successfully saved %s" % destination)


def extract_zip(zip_file, output_dir):
    """Extract every member of *zip_file* into *output_dir*.

    *output_dir* is created when it does not exist; an already existing
    directory is reused.  Any other failure to create it is raised as
    ``OSError`` instead of being silently ignored.
    """
    try:
        os.makedirs(output_dir)
    except OSError as err:
        # Only "directory already exists" is expected here.
        if err.errno != errno.EEXIST or not os.path.isdir(output_dir):
            raise

    with zipfile.ZipFile(zip_file) as archive:
        # extractall() handles nested members and, per the zipfile
        # documentation, strips absolute prefixes and ".." components from
        # member names, unlike joining the raw names onto output_dir.
        archive.extractall(output_dir)
    print("Extracted %s to %s" % (zip_file, output_dir))


def parse_names_data(names_dir, first_year=FIRST_YEAR, last_year=LAST_YEAR):
    """Load ``yob<year>.txt`` for each year in ``[first_year, last_year]``.

    Each file is read as header-less CSV with columns ``name``, ``sex`` and
    ``births``; a ``year`` column is added before the per-year frames are
    concatenated.

    Returns:
        One DataFrame with columns ``name``, ``sex``, ``births``, ``year``
        and a fresh 0..n-1 index.

    Raises:
        ValueError: from ``pd.concat`` when the year range is empty.
    """
    parts = []
    for year in range(first_year, last_year + 1):
        path = os.path.join(names_dir, "yob%d.txt" % year)
        frame = pd.read_csv(path, names=['name', 'sex', 'births'])
        frame['year'] = year
        parts.append(frame)
    return pd.concat(parts, ignore_index=True)


def total_number_of_births(name, data=None):
    """Return the summed ``births`` over all rows whose ``name`` equals *name*.

    *data* is a frame shaped like the result of ``parse_names_data``.  When it
    is omitted, the module-level ``names`` frame is used; that frame only
    exists once the script section below has run.
    """
    frame = names if data is None else data
    return frame[frame.name == name].births.sum()


if __name__ == "__main__":
    # Fetch the archive only when it is not already on disk.
    if not os.path.exists(NAMES_ZIP):
        download_name_data()

    names_data_dir = "names_data"
    extract_zip(NAMES_ZIP, names_data_dir)
    names = parse_names_data(names_data_dir)

    # These were bare notebook expressions, which display nothing when the
    # file runs as a script, so they are printed explicitly.
    print(names.head())
    print(names.tail())
    print(names[names.name == 'Caleb'])

    print("Total number of people born with each name between %d and %d:"
          % (FIRST_YEAR, LAST_YEAR))
    name_series = Series(['Caleb', 'Whitney', 'Hastin', 'Rafe', 'Dan',
                          'Mary', 'Leah', 'Wesley', 'Ambria', 'Tradley'])
    births_list = [total_number_of_births(name) for name in name_series]
    births_by_name_frame = DataFrame(
        {'births since 1880 in the US': births_list}, index=name_series)
    print(births_by_name_frame)

    # pivot_table's rows=/cols= keywords were replaced by index=/columns=.
    total_births = names.pivot_table(
        values='births', index='year', columns='sex', aggfunc='sum')
    print(total_births.head())
    # NOTE(review): whether the figures appear depends on the matplotlib
    # backend / interactive session in use; confirm for your setup.
    total_births.plot(figsize=(10, 5))

    births_by_year_by_name = names.pivot_table(
        values='births', index='name', columns='year', aggfunc='sum')
    # .loc replaces the removed .ix indexer for label-based row lookup.
    calebs_by_year = births_by_year_by_name.loc['Caleb']
    calebs_by_year.plot(title="Caleb's born by year", figsize=(10, 5))
    births_by_year_by_name.loc['Whitney'].plot(figsize=(10, 5))
    births_by_year_by_name.loc['Bertha'].plot(figsize=(10, 5))