#!/usr/bin/env python # coding: utf-8 # # Combine data files with state populations # The first data file has 2000-2010 # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import seaborn as sns import pandas as pd import numpy as np import os from os.path import join cwd = os.getcwd() data_directory = join(cwd, '..', 'Data storage') # ## Import 2000-2010 data # The sex, origin, race, and age columns are ALL when they have values of 0 # # Not clear if these are beginning or end of year values. # # https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/ # In[3]: path = os.path.join(data_directory, 'Population data', 'st-est00int-alldata.csv') pop1 = pd.read_csv(path) # In[4]: pop1.head() # In[5]: pop1 = pop1.loc[(pop1['SEX'] == 0) & (pop1['ORIGIN'] == 0) & (pop1['RACE'] == 0) & (pop1['AGEGRP'] == 0), :] # In[6]: # Column names for population estimate est_cols = ['POPESTIMATE{}'.format(x) for x in range(2000, 2011)] # In[7]: # Melt the wide-form data into a tidy dataframe pop1_tidy = pd.melt(pop1, id_vars='NAME', value_vars=est_cols, var_name='Year', value_name='Population') # In[8]: pop1_tidy.head() # In[9]: def map_year(x): 'Return last 4 characters (the year)' year = x[-4:] return int(year) pop1_tidy['Year'] = pop1_tidy['Year'].map(map_year) # The values shown below are ever slightly different than those listed in the later dataset. # In[10]: pop1_tidy.loc[pop1_tidy['Year'] == 2010].head() # In[11]: pop1_tidy.head() # In[12]: pop1_tidy.tail() # In[13]: pop1_tidy.columns = ['State', 'Year', 'Population'] # ## Import 2010-2016 data # https://www.census.gov/data/tables/2016/demo/popest/state-total.html # In[14]: path = os.path.join(data_directory, 'Population data', 'nst-est2016-01.xlsx') pop2 = pd.read_excel(path, header=3, parse_cols='A, D:J', skip_footer=7) # In[15]: pop2.head() # In[16]: pop2.tail() # In[17]: drop_rows = ['Northeast', 'Midwest', 'South', 'West'] pop2.drop(drop_rows, inplace=True) # In[19]: pop2.index = pop2.index.str.strip('.') # In[20]: pop2.head() # In[21]: pop2.columns # In[22]: pop2_tidy = pd.melt(pop2.reset_index(), id_vars='index', value_vars=range(2010, 2017), value_name='Population', var_name='Year') pop2_tidy.columns = ['State', 'Year', 'Population'] # ## Combine data # In[23]: pop_total = pd.concat([pop1_tidy, pop2_tidy]) # The overlapping 2010 values are different, but just barely. I'm going to re-combine the datasets and keep values from the second dataset. # In[24]: pop_total.loc[pop_total['Year']==2010].sort_values('State') # In[25]: pop_total = pd.concat([pop1_tidy.loc[pop1_tidy['Year'] < 2010], pop2_tidy]) # In[26]: pop_total.head() # In[27]: pop_total.tail() # In[81]: path = os.path.join('Data storage', 'Derived data', 'State population.csv') pop_total.to_csv(path, index=False)