#import dependencies
import pandas as pd
import matplotlib.pyplot as plt
# Apply matplotlib's 'fivethirtyeight' style sheet to the plots drawn below
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
The code below is borrowed from Brandon Rhodes. His Pandas tutorial is very helpful and teaches the core features of the pandas library.
I like using this CSS styling in my projects because it makes the data, index, and header cells easy to tell apart at a glance. It comes in very handy, especially when a table has multiple index levels with headers.
from IPython.core.display import HTML

# Read both stylesheets through context managers so the file handles are
# closed as soon as their contents are in memory; the bare open(...).read()
# calls left closing them to the garbage collector.
with open('style-table.css') as table_css, open('style-notebook.css') as notebook_css:
    css = table_css.read() + notebook_css.read()

# Keep this as the last expression of the cell: the resulting HTML object is
# the cell's value, which is what wraps the combined CSS in a <style> element.
HTML('<style>{}</style>'.format(css))
# Load the names CSV into `df`, using the file's 'Id' column as the row index
names_csv = 'NationalNames.csv'
df = pd.read_csv(names_csv, index_col='Id')
# Preview the first rows to check that the file parsed as expected
df.head()
Id | Name | Year | Gender | Count |
---|---|---|---|---|
1 | Mary | 1880 | F | 7065 |
2 | Anna | 1880 | F | 2604 |
3 | Emma | 1880 | F | 2003 |
4 | Elizabeth | 1880 | F | 1939 |
5 | Minnie | 1880 | F | 1746 |
# Get some information about our dataset (index, columns, dtypes, memory usage)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1825433 entries, 1 to 1825433 Data columns (total 4 columns): Name object Year int64 Gender object Count int64 dtypes: int64(2), object(2) memory usage: 69.6+ MB
There are 1,825,433 rows in our dataset.
# Smallest value in the 'Year' column, i.e. the earliest year in the data
df['Year'].min()
1880
# Largest value in the 'Year' column, i.e. the latest year in the data
df['Year'].max()
2014
Our dataset has names from 1880 to 2014.
# Sum of the 'Count' column over all rows whose Gender is 'F'
f = df.loc[df['Gender'] == 'F', 'Count'].sum()
f
167070477
# Sum of the 'Count' column over all rows whose Gender is 'M'
m = df.loc[df['Gender'] == 'M', 'Count'].sum()
m
170064949
# Combined total: female 'Count' sum plus male 'Count' sum
f + m
337135426
# Difference between the two totals (male 'Count' sum minus female 'Count' sum)
m - f
2994472
# Sum of 'Count' over every row named 'Nicole' (no filter on year or gender)
Nicole = df.loc[df['Name'] == 'Nicole', 'Count'].sum()
Nicole
581900
# Per-year 'Count' totals for the name Nicole, drawn as a line chart
nicole_rows = df[df['Name'] == 'Nicole']
nicole_by_year = nicole_rows.groupby('Year')[['Count']].sum()
nicole_by_year.plot(grid=True, figsize=(10, 5))
plt.xlabel('Year')
plt.ylabel('Number of Names')
plt.title('Number of Nicole in the history of the US')
# Trailing semicolon keeps the notebook from echoing the return value
plt.xlim(1880, 2016);
# Sum of 'Count' over every row named 'Numan' (no filter on year or gender)
Numan = df.loc[df['Name'] == 'Numan', 'Count'].sum()
Numan
69
# Per-year 'Count' totals for the name Numan, drawn as a bar chart.
# Only 'Year' and 'Count' are selected before grouping: summing the whole
# frame would also push the text columns ('Name', 'Gender') through sum(),
# which relied on pandas silently dropping them and is no longer the default
# behaviour in pandas 2.x.
df.query('Name=="Numan"')[['Year', 'Count']].groupby('Year').sum().plot(kind='bar', figsize=(10, 5))
plt.xlabel('Year')
plt.ylabel('Number of Names')
# Trailing semicolon keeps the notebook from echoing the return value
plt.title('Number of Numans in the history of the US');
# Sum of 'Count' over every row named 'Esma' (no filter on year or gender)
Esma = df.loc[df['Name'] == 'Esma', 'Count'].sum()
Esma
595
# Per-year 'Count' totals for the name Esma, drawn as a line chart
esma_rows = df[df['Name'] == 'Esma']
esma_by_year = esma_rows.groupby('Year')[['Count']].sum()
esma_by_year.plot(grid=True, figsize=(10, 5))
plt.xlabel('Year')
plt.ylabel('Number of Names')
plt.title('Number of Esma in the history of the US')
# Trailing semicolon keeps the notebook from echoing the return value
plt.xlim(1880, 2016);