#!/usr/bin/env python
# coding: utf-8

# # High-speed-rail document analysis
#
# By [Ben Welsh](https://palewire.dev/who-is-ben-welsh/)
#
# This analysis was conducted for the April 27, 2019, story ["How California’s faltering high-speed rail project was ‘captured’ by costly consultants."](https://www.latimes.com/local/california/la-me-california-high-speed-rail-consultants-20190426-story.html)
#
# It found that outside consultants have provided more than 3,000 environmental statements, business documents and other reports to the California High-Speed Rail Authority. Altogether they contain more than 152,000 pages.

# ## How we did it

# Import Python tools.

# In[2]:

import os
import PyPDF2
import pathlib
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

# Set the output directory where documents will be saved.

# In[5]:

output_dir = pathlib.Path(os.getenv("OUTPUT_DIR") or "./output")
output_dir.mkdir(parents=True, exist_ok=True)

# Read in a list of all the URLs on the rail authority's site that contain consultant reports.

# In[23]:

page_list = [u for u in open("./input/urls.csv").read().splitlines() if u]

# Request each page and parse out all of its PDF links.

# In[27]:

def parse_page(url):
    """
    Parse all the PDF urls from the provided URL's HTML.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    # Only keep anchors that actually have an href attribute.
    a_list = soup.find_all("a", href=True)
    pdf_list = [a["href"] for a in a_list if a["href"].endswith(".pdf")]
    return [f"http://hsr.ca.gov{href}" for href in pdf_list]

# In[28]:

pdf_list = []

# In[31]:

for page in page_list:
    pdfs = parse_page(page)
    pdf_list.extend(pdfs)

# How many total PDF urls were found?

# In[40]:

f"{len(pdf_list):,}"

# Remove all of the duplicates.

# In[41]:

pdf_set = set(pdf_list)

# How many URLs remain?

# In[42]:

f"{len(pdf_set):,}"

# Download them all.

# In[32]:

def download_pdf(url):
    """
    Download a PDF url to the output folder.
    """
    filename = url.split("/")[-1]
    path = output_dir.joinpath(filename)
    # Skip files we've already downloaded.
    if path.exists():
        return
    try:
        print(f"Downloading {filename}")
        urlretrieve(url, path)
    except Exception:
        print(f"Failed to download {url}")

# In[35]:

for url in pdf_set:
    download_pdf(url)

# Get their page counts.

# In[45]:

def get_page_count(path):
    """
    Get the page count of the provided PDF path.
    """
    with open(path, "rb") as f:
        try:
            pdf_reader = PyPDF2.PdfFileReader(f)
            return pdf_reader.numPages
        except Exception:
            # Return NaN for PDFs that can't be parsed.
            return float("nan")

# In[36]:

path_list = list(output_dir.glob("*.pdf"))

# Count the total number of documents again to check how many we actually downloaded.

# In[47]:

f"{len(path_list):,}"

# Loop through all of the documents and tally their pages.

# In[38]:

page_counts = dict((p, get_page_count(p)) for p in path_list)

# In[49]:

df = pd.DataFrame(list(page_counts.items()), columns=["pdf", "page_count"])

# In[50]:

df.sort_values("page_count", ascending=False).head()

# In[51]:

len(df)

# In[52]:

f"{df.page_count.sum():,}"
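
# Note: newer releases of PyPDF2 removed the `PdfFileReader`/`numPages` API used above. If you re-run this notebook with a current toolchain, the same page tally could be sketched with the maintained `pypdf` package instead. This cell is a minimal alternative sketch, not part of the original analysis; the `get_page_count_pypdf` helper name is ours.

# In[ ]:

# Optional alternative to `get_page_count`, assuming `pip install pypdf`
# and the same `output_dir` of downloaded PDFs as above.
from pypdf import PdfReader

def get_page_count_pypdf(path):
    """
    Get the page count of the provided PDF path with pypdf.
    """
    try:
        return len(PdfReader(path).pages)
    except Exception:
        # Return NaN for PDFs that can't be parsed.
        return float("nan")

# Example usage, mirroring the tally above:
# page_counts = {p: get_page_count_pypdf(p) for p in output_dir.glob("*.pdf")}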