#!/usr/bin/env python
# coding: utf-8

# # Pew Research data on religious affiliation
# 
# Elements of Data Science
# 
# by [Allen Downey](https://allendowney.com)
# 
# [MIT License](https://opensource.org/licenses/MIT)
# 
# Loading and cleaning data from Pew Research Center, Religion and Public Life.
# 
# "[In U.S., Decline of Christianity Continues at Rapid Pace](https://www.pewforum.org/2019/10/17/in-u-s-decline-of-christianity-continues-at-rapid-pace/): An update on America's changing religious landscape", October 17, 2019.
# 
# We can download the tables directly from Pew Research:

# In[1]:


import os
from urllib.request import urlretrieve

PDF_URL = 'https://www.pewforum.org/wp-content/uploads/sites/7/2019/10/Detailed-Tables-v1-FOR-WEB.pdf'
PDF_FILE = 'Detailed-Tables-v1-FOR-WEB.pdf'

# Fetch the PDF only once; later runs reuse the local copy.
if not os.path.exists(PDF_FILE):
    # Download with the standard library instead of `!wget`, so this cell also
    # works when the file is run as a plain Python script (where get_ipython
    # is not defined) and on systems that do not have wget installed.
    # urlretrieve raises on HTTP errors, so a failed download is not silent.
    partial_file = PDF_FILE + '.part'
    urlretrieve(PDF_URL, partial_file)
    # Rename only after the transfer completes, so an interrupted download
    # never leaves a truncated file that the existence check would accept.
    os.replace(partial_file, PDF_FILE)


# The data are in a table in a PDF document, so I'll use [tabula-py](https://tabula-py.readthedocs.io/en/latest/) to read them.

# In[8]:


import sys
# True when the `google.colab` module is already loaded in this interpreter,
# which is used here as the signal that the notebook is running on Colab.
IN_COLAB = 'google.colab' in sys.modules

# Install tabula-py only on Colab; presumably other environments are expected
# to have it installed already (the later `from tabula import read_pdf`
# fails otherwise).
if IN_COLAB:
    get_ipython().system('pip install tabula-py')


# In[9]:


from tabula import read_pdf
import pandas as pd


# I'll read the table from page 1, specifying which column to use as the index and which columns to read.  I'm using data from the Aggregated Political Surveys (columns 3-12) and skipping the Religious Landscape Studies (columns 1-2).

# In[3]:


pages = 1
# These are pandas.read_csv options: column 0 supplies the row labels and
# columns 3-12 are the data columns to keep.
pandas_options = dict(index_col=0, usecols=[0,3,4,5,6,7,8,9,10,11,12])
# Ask for single-table mode explicitly. In that mode tabula-py parses the page
# with pandas.read_csv, which understands index_col/usecols. tabula-py 2.x
# defaults to multiple_tables=True, which instead passes pandas_options to the
# DataFrame constructor (where these options are invalid) and returns a list.
tables = read_pdf('Detailed-Tables-v1-FOR-WEB.pdf', pages=pages,
                  multiple_tables=False, pandas_options=pandas_options)
# Older tabula-py returns the DataFrame itself; 2.x wraps it in a list.
df = tables[0] if isinstance(tables, list) else tables
df


# Because there are two columns with the heading `2014`, `tabula-py` makes one of them `2014.1`.  So I'll rename it.

# In[4]:


# Give the column that was disambiguated as '2014.1' its plain '2014' label.
df = df.rename({'2014.1': '2014'}, axis='columns')
df


# I transpose the DataFrame so the dates are the index.

# In[5]:


# Swap rows and columns so that the former column labels become the index.
df = df.T
df


# Now I can convert the index from string to int.

# In[6]:


# The index labels are still strings at this point; cast them to integers.
df.index = df.index.astype(dtype=int)
df


# And write the result to a file.

# In[7]:


# Save the cleaned table (index included) as a CSV file in the working directory.
df.to_csv(path_or_buf='pew_religion_table1.csv')


# 

# In[ ]: