#!/usr/bin/env python
# coding: utf-8

# # Combine data files with state populations
# The first data file has 2000-2010

# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from os.path import join
cwd = os.getcwd()
data_directory = join(cwd, '..', 'Data storage')


# ## Import 2000-2010 data
# The sex, origin, race, and age columns are ALL when they have values of 0
# 
# Not clear if these are beginning or end of year values.
# 
# https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/

# In[3]:


path = os.path.join(data_directory, 'Population data',
                    'st-est00int-alldata.csv')
pop1 = pd.read_csv(path)


# In[4]:


pop1.head()


# In[5]:


pop1 = pop1.loc[(pop1['SEX'] == 0) &
                (pop1['ORIGIN'] == 0) &
                (pop1['RACE'] == 0) &
                (pop1['AGEGRP'] == 0), :]


# In[6]:


# Column names for population estimate
est_cols = ['POPESTIMATE{}'.format(x) for x in range(2000, 2011)]


# In[7]:


# Melt the wide-form data into a tidy dataframe
pop1_tidy = pd.melt(pop1, id_vars='NAME',
                    value_vars=est_cols, var_name='Year',
                    value_name='Population')


# In[8]:


pop1_tidy.head()


# In[9]:


def map_year(x):
    'Return last 4 characters (the year)'
    year = x[-4:]
    return int(year)

pop1_tidy['Year'] = pop1_tidy['Year'].map(map_year)


# The values shown below are ever slightly different than those listed in the later dataset.

# In[10]:


pop1_tidy.loc[pop1_tidy['Year'] == 2010].head()


# In[11]:


pop1_tidy.head()


# In[12]:


pop1_tidy.tail()


# In[13]:


pop1_tidy.columns = ['State', 'Year', 'Population']


# ## Import 2010-2016 data
# https://www.census.gov/data/tables/2016/demo/popest/state-total.html

# In[14]:


path = os.path.join(data_directory, 'Population data', 'nst-est2016-01.xlsx')
pop2 = pd.read_excel(path, header=3, parse_cols='A, D:J', skip_footer=7)


# In[15]:


pop2.head()


# In[16]:


pop2.tail()


# In[17]:


drop_rows = ['Northeast', 'Midwest', 'South', 'West']
pop2.drop(drop_rows, inplace=True)


# In[19]:


pop2.index = pop2.index.str.strip('.')


# In[20]:


pop2.head()


# In[21]:


pop2.columns


# In[22]:


pop2_tidy = pd.melt(pop2.reset_index(), id_vars='index',
                    value_vars=range(2010, 2017), value_name='Population',
                    var_name='Year')
pop2_tidy.columns = ['State', 'Year', 'Population']


# ## Combine data

# In[23]:


pop_total = pd.concat([pop1_tidy, pop2_tidy])


# The overlapping 2010 values are different, but just barely. I'm going to re-combine the datasets and keep values from the second dataset.

# In[24]:


pop_total.loc[pop_total['Year']==2010].sort_values('State')


# In[25]:


pop_total = pd.concat([pop1_tidy.loc[pop1_tidy['Year'] < 2010], pop2_tidy])


# In[26]:


pop_total.head()


# In[27]:


pop_total.tail()


# In[81]:


path = os.path.join('Data storage', 'Derived data', 'State population.csv')
pop_total.to_csv(path, index=False)