#!/usr/bin/env python
# coding: utf-8

# # Pew Research data on religious affiliation
#
# Elements of Data Science
#
# by [Allen Downey](https://allendowney.com)
#
# [MIT License](https://opensource.org/licenses/MIT)
#
# Loading and cleaning data from Pew Research Center, Religion and Public Life.
#
# "[In U.S., Decline of Christianity Continues at Rapid Pace](https://www.pewforum.org/2019/10/17/in-u-s-decline-of-christianity-continues-at-rapid-pace/): An update on America's changing religious landscape", October 17, 2019.
#
# We can download the tables directly from Pew Research:

# In[1]:


import os
import urllib.request

PDF_FILENAME = 'Detailed-Tables-v1-FOR-WEB.pdf'
PDF_URL = 'https://www.pewforum.org/wp-content/uploads/sites/7/2019/10/Detailed-Tables-v1-FOR-WEB.pdf'

# Download only when the PDF is not already in the working directory.
# urllib is used instead of an IPython `!wget` shell escape so that this
# also runs as a plain script and does not need an external wget binary.
if not os.path.exists(PDF_FILENAME):
    urllib.request.urlretrieve(PDF_URL, PDF_FILENAME)


# The data are in a table in a PDF document, so I'll use [tabula-py](https://tabula-py.readthedocs.io/en/latest/) to read them.

# In[8]:


import subprocess
import sys

IN_COLAB = 'google.colab' in sys.modules

# On Colab, install tabula-py into the interpreter that is running this code.
# Invoking pip through sys.executable works with or without IPython.
if IN_COLAB:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tabula-py'])


# In[9]:


from tabula import read_pdf
import pandas as pd


# I'll read the table from page 1, specifying which column to use as the index and which columns to read. I'm using data from the Aggregated Political Surveys (columns 3-12) and skipping the Religious Landscape Studies (columns 1-2).

# In[3]:


pages = 1
pandas_options = dict(index_col=0,
                      usecols=[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

# multiple_tables=False asks tabula-py to treat the page as one table and to
# hand `pandas_options` to the pandas CSV reader.
# NOTE(review): newer tabula-py releases return a list of DataFrames while
# older ones return a single DataFrame -- confirm against the installed
# version; both shapes are accepted below.
tables = read_pdf(PDF_FILENAME,
                  pages=pages,
                  multiple_tables=False,
                  pandas_options=pandas_options)
df = tables[0] if isinstance(tables, list) else tables
df


# Because there are two columns with the heading `2014`, `tabula-py` makes one of them `2014.1`. So I'll rename it.

# In[4]:


df = df.rename(columns={'2014.1': '2014'})
df


# I transpose the DataFrame so the dates are the index.

# In[5]:


df = df.transpose()
df


# Now I can convert the index from string to int.

# In[6]:


df.index = df.index.astype(int)
df


# And write the result to a file.

# In[7]:


df.to_csv('pew_religion_table1.csv')


# In[ ]: