Analyze the baby names dataset using pandas
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load dataset
import zipfile
# Read the CSV directly out of the zip archive. Both the archive and the
# member handle are context-managed so they are closed once the frame has
# been loaded into memory.
with zipfile.ZipFile('../datasets/baby-names2.csv.zip', 'r') as archive:
    with archive.open('baby-names2.csv') as csv_file:
        # read_csv is the public entry point and already defaults to sep=','.
        names = pd.read_csv(csv_file)
# Preview the first rows of the loaded table.
names.head()
year | name | prop | sex | soundex | |
---|---|---|---|---|---|
0 | 1880 | John | 0.081541 | boy | J500 |
1 | 1880 | William | 0.080511 | boy | W450 |
2 | 1880 | James | 0.050057 | boy | J520 |
3 | 1880 | Charles | 0.045167 | boy | C642 |
4 | 1880 | George | 0.043292 | boy | G620 |
# Preview the first rows whose year column equals 1993.
names.loc[names['year'] == 1993].head()
year | name | prop | sex | soundex | |
---|---|---|---|---|---|
113000 | 1993 | Michael | 0.024010 | boy | M240 |
113001 | 1993 | Christopher | 0.018572 | boy | C623 |
113002 | 1993 | Matthew | 0.017332 | boy | M300 |
113003 | 1993 | Joshua | 0.016268 | boy | J200 |
113004 | 1993 | Tyler | 0.014439 | boy | T460 |
# Split the table by the value of the sex column. Each subset is copied so
# it is an independent frame rather than a selection of `names`.
boys = names.loc[names['sex'].eq('boy')].copy()
girls = names.loc[names['sex'].eq('girl')].copy()
def plot_name_trend(frame, name):
    """Plot the yearly proportion of one name and return its rows.

    Parameters
    ----------
    frame : DataFrame with 'name', 'year' and 'prop' columns.
    name : the exact value to match in the 'name' column.

    Returns
    -------
    The subset of ``frame`` whose 'name' equals ``name``.
    """
    rows = frame[frame['name'] == name]
    years = rows['year'].values
    # Plot against the actual year instead of the row position, so a year
    # with no entry for this name cannot shift the labels of later points.
    # NOTE(review): assumes rows are already ordered by year, as in the
    # previews of the loaded table - confirm for the full dataset.
    plt.plot(years, rows['prop'])
    # Label every fifth data point with its year, written vertically.
    plt.xticks(years[::5], years[::5], rotation='vertical')
    # Fixed y-range so plots of different names are directly comparable.
    plt.ylim([0, 0.1])
    plt.show()
    return rows


william = plot_name_trend(boys, 'William')
# The capitalised variable name is kept in case other cells refer to it.
Daniel = plot_name_trend(boys, 'Daniel')
Which boy name has been the most popular in each decade?
Which girl name has been the most popular?
What is the most popular new girl name? (A "new" name is one that appears only in the 2000s.)