The first data file has 2000-2010
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from os.path import join
cwd = os.getcwd()
data_directory = join(cwd, '..', 'Data storage')
The sex, origin, race, and age columns are ALL when they have values of 0
Not clear if these are beginning or end of year values.
https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/
path = os.path.join(data_directory, 'Population data',
'st-est00int-alldata.csv')
pop1 = pd.read_csv(path)
pop1.head()
REGION | DIVISION | STATE | NAME | SEX | ORIGIN | RACE | AGEGRP | ESTIMATESBASE2000 | POPESTIMATE2000 | ... | POPESTIMATE2002 | POPESTIMATE2003 | POPESTIMATE2004 | POPESTIMATE2005 | POPESTIMATE2006 | POPESTIMATE2007 | POPESTIMATE2008 | POPESTIMATE2009 | CENSUS2010POP | POPESTIMATE2010 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | United States | 0 | 0 | 0 | 0 | 281424600 | 282162411 | ... | 287625193 | 290107933 | 292805298 | 295516599 | 298379912 | 301231207 | 304093966 | 306771529 | 308745538 | 309349689 |
1 | 0 | 0 | 0 | United States | 0 | 0 | 0 | 1 | 19176154 | 19178293 | ... | 19429192 | 19592446 | 19785885 | 19917400 | 19938883 | 20125962 | 20271127 | 20244518 | 20201362 | 20200529 |
2 | 0 | 0 | 0 | United States | 0 | 0 | 0 | 2 | 20549855 | 20463852 | ... | 19872417 | 19620851 | 19454237 | 19389067 | 19544688 | 19714611 | 19929602 | 20182499 | 20348657 | 20382409 |
3 | 0 | 0 | 0 | United States | 0 | 0 | 0 | 3 | 20528425 | 20637696 | ... | 21261421 | 21415353 | 21411680 | 21212579 | 21033138 | 20841042 | 20706655 | 20660564 | 20677194 | 20694011 |
4 | 0 | 0 | 0 | United States | 0 | 0 | 0 | 4 | 20218782 | 20294955 | ... | 20610370 | 20797166 | 21102552 | 21486214 | 21807709 | 22067816 | 22210880 | 22192810 | 22040343 | 21959087 |
5 rows × 21 columns
pop1 = pop1.loc[(pop1['SEX'] == 0) &
(pop1['ORIGIN'] == 0) &
(pop1['RACE'] == 0) &
(pop1['AGEGRP'] == 0), :]
# Column names for population estimate
est_cols = ['POPESTIMATE{}'.format(x) for x in range(2000, 2011)]
# Melt the wide-form data into a tidy dataframe
pop1_tidy = pd.melt(pop1, id_vars='NAME',
value_vars=est_cols, var_name='Year',
value_name='Population')
pop1_tidy.head()
NAME | Year | Population | |
---|---|---|---|
0 | United States | POPESTIMATE2000 | 282162411 |
1 | Alabama | POPESTIMATE2000 | 4452173 |
2 | Alaska | POPESTIMATE2000 | 627963 |
3 | Arizona | POPESTIMATE2000 | 5160586 |
4 | Arkansas | POPESTIMATE2000 | 2678588 |
def map_year(x):
'Return last 4 characters (the year)'
year = x[-4:]
return int(year)
pop1_tidy['Year'] = pop1_tidy['Year'].map(map_year)
The values shown below are ever slightly different than those listed in the later dataset.
pop1_tidy.loc[pop1_tidy['Year'] == 2010].head()
NAME | Year | Population | |
---|---|---|---|
520 | United States | 2010 | 309349689 |
521 | Alabama | 2010 | 4785298 |
522 | Alaska | 2010 | 713985 |
523 | Arizona | 2010 | 6413737 |
524 | Arkansas | 2010 | 2921606 |
pop1_tidy.head()
NAME | Year | Population | |
---|---|---|---|
0 | United States | 2000 | 282162411 |
1 | Alabama | 2000 | 4452173 |
2 | Alaska | 2000 | 627963 |
3 | Arizona | 2000 | 5160586 |
4 | Arkansas | 2000 | 2678588 |
pop1_tidy.tail()
NAME | Year | Population | |
---|---|---|---|
567 | Virginia | 2010 | 8024617 |
568 | Washington | 2010 | 6744496 |
569 | West Virginia | 2010 | 1853973 |
570 | Wisconsin | 2010 | 5691047 |
571 | Wyoming | 2010 | 564460 |
pop1_tidy.columns = ['State', 'Year', 'Population']
https://www.census.gov/data/tables/2016/demo/popest/state-total.html
path = os.path.join(data_directory, 'Population data', 'nst-est2016-01.xlsx')
pop2 = pd.read_excel(path, header=3, parse_cols='A, D:J', skip_footer=7)
pop2.head()
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | |
---|---|---|---|---|---|---|---|
United States | 309348193 | 311663358 | 313998379 | 316204908 | 318563456 | 320896618 | 323127513 |
Northeast | 55388056 | 55632766 | 55829059 | 55988771 | 56116791 | 56184737 | 56209510 |
Midwest | 66978602 | 67153331 | 67332320 | 67543948 | 67726368 | 67838387 | 67941429 |
South | 114863114 | 116061801 | 117299171 | 118424320 | 119696311 | 121039206 | 122319574 |
West | 72118421 | 72815460 | 73537829 | 74247869 | 75023986 | 75834288 | 76657000 |
pop2.tail()
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | |
---|---|---|---|---|---|---|---|
.Virginia | 8025773 | 8110035 | 8192048 | 8262692 | 8317372 | 8367587 | 8411808 |
.Washington | 6743226 | 6822520 | 6895226 | 6968006 | 7054196 | 7160290 | 7288000 |
.West Virginia | 1854230 | 1854972 | 1856560 | 1853231 | 1848514 | 1841053 | 1831102 |
.Wisconsin | 5690263 | 5709640 | 5726177 | 5742854 | 5758377 | 5767891 | 5778708 |
.Wyoming | 564513 | 567725 | 576765 | 582684 | 583642 | 586555 | 585501 |
drop_rows = ['Northeast', 'Midwest', 'South', 'West']
pop2.drop(drop_rows, inplace=True)
pop2.index = pop2.index.str.strip('.')
pop2.head()
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | |
---|---|---|---|---|---|---|---|
United States | 309348193 | 311663358 | 313998379 | 316204908 | 318563456 | 320896618 | 323127513 |
Alabama | 4785492 | 4799918 | 4815960 | 4829479 | 4843214 | 4853875 | 4863300 |
Alaska | 714031 | 722713 | 731089 | 736879 | 736705 | 737709 | 741894 |
Arizona | 6408312 | 6467163 | 6549634 | 6624617 | 6719993 | 6817565 | 6931071 |
Arkansas | 2921995 | 2939493 | 2950685 | 2958663 | 2966912 | 2977853 | 2988248 |
pop2.columns
Int64Index([2010, 2011, 2012, 2013, 2014, 2015, 2016], dtype='int64')
pop2_tidy = pd.melt(pop2.reset_index(), id_vars='index',
value_vars=range(2010, 2017), value_name='Population',
var_name='Year')
pop2_tidy.columns = ['State', 'Year', 'Population']
pop_total = pd.concat([pop1_tidy, pop2_tidy])
The overlapping 2010 values are different, but just barely. I'm going to re-combine the datasets and keep values from the second dataset.
pop_total.loc[pop_total['Year']==2010].sort_values('State')
State | Year | Population | |
---|---|---|---|
1 | Alabama | 2010 | 4785492 |
521 | Alabama | 2010 | 4785298 |
522 | Alaska | 2010 | 713985 |
2 | Alaska | 2010 | 714031 |
523 | Arizona | 2010 | 6413737 |
3 | Arizona | 2010 | 6408312 |
524 | Arkansas | 2010 | 2921606 |
4 | Arkansas | 2010 | 2921995 |
525 | California | 2010 | 37349363 |
5 | California | 2010 | 37332685 |
526 | Colorado | 2010 | 5049071 |
6 | Colorado | 2010 | 5048644 |
527 | Connecticut | 2010 | 3577073 |
7 | Connecticut | 2010 | 3579899 |
528 | Delaware | 2010 | 899769 |
8 | Delaware | 2010 | 899816 |
529 | District of Columbia | 2010 | 604453 |
9 | District of Columbia | 2010 | 605183 |
530 | Florida | 2010 | 18843326 |
10 | Florida | 2010 | 18849098 |
11 | Georgia | 2010 | 9713521 |
531 | Georgia | 2010 | 9712587 |
532 | Hawaii | 2010 | 1363621 |
12 | Hawaii | 2010 | 1363945 |
13 | Idaho | 2010 | 1571010 |
533 | Idaho | 2010 | 1571450 |
14 | Illinois | 2010 | 12841578 |
534 | Illinois | 2010 | 12843166 |
15 | Indiana | 2010 | 6490528 |
535 | Indiana | 2010 | 6490621 |
... | ... | ... | ... |
558 | Oregon | 2010 | 3838957 |
38 | Oregon | 2010 | 3838048 |
39 | Pennsylvania | 2010 | 12712343 |
559 | Pennsylvania | 2010 | 12709630 |
40 | Rhode Island | 2010 | 1053337 |
560 | Rhode Island | 2010 | 1052886 |
41 | South Carolina | 2010 | 4635943 |
561 | South Carolina | 2010 | 4636312 |
42 | South Dakota | 2010 | 816325 |
562 | South Dakota | 2010 | 816463 |
43 | Tennessee | 2010 | 6356671 |
563 | Tennessee | 2010 | 6356897 |
564 | Texas | 2010 | 25257114 |
44 | Texas | 2010 | 25244310 |
0 | United States | 2010 | 309348193 |
520 | United States | 2010 | 309349689 |
45 | Utah | 2010 | 2775326 |
565 | Utah | 2010 | 2776469 |
566 | Vermont | 2010 | 625960 |
46 | Vermont | 2010 | 625982 |
47 | Virginia | 2010 | 8025773 |
567 | Virginia | 2010 | 8024617 |
48 | Washington | 2010 | 6743226 |
568 | Washington | 2010 | 6744496 |
49 | West Virginia | 2010 | 1854230 |
569 | West Virginia | 2010 | 1853973 |
50 | Wisconsin | 2010 | 5690263 |
570 | Wisconsin | 2010 | 5691047 |
571 | Wyoming | 2010 | 564460 |
51 | Wyoming | 2010 | 564513 |
104 rows × 3 columns
pop_total = pd.concat([pop1_tidy.loc[pop1_tidy['Year'] < 2010], pop2_tidy])
pop_total.head()
State | Year | Population | |
---|---|---|---|
0 | United States | 2000 | 282162411 |
1 | Alabama | 2000 | 4452173 |
2 | Alaska | 2000 | 627963 |
3 | Arizona | 2000 | 5160586 |
4 | Arkansas | 2000 | 2678588 |
pop_total.tail()
State | Year | Population | |
---|---|---|---|
359 | Virginia | 2016 | 8411808 |
360 | Washington | 2016 | 7288000 |
361 | West Virginia | 2016 | 1831102 |
362 | Wisconsin | 2016 | 5778708 |
363 | Wyoming | 2016 | 585501 |
path = os.path.join('Data storage', 'Derived data', 'State population.csv')
pop_total.to_csv(path, index=False)