#!/usr/bin/env python
# coding: utf-8

# # High-speed-rail document analysis
#
# By [Ben Welsh](https://palewire.dev/who-is-ben-welsh/)
#
# This analysis was conducted for the April 27, 2019, story ["How California’s faltering high-speed rail project was ‘captured’ by costly consultants."](https://www.latimes.com/local/california/la-me-california-high-speed-rail-consultants-20190426-story.html)
#
# It found that outside consultants have provided more than 3,000 environmental statements, business documents and other reports to the California High-Speed Rail Authority. Altogether they contain more than 152,000 pages.

# ## How we did it

# Import Python tools.

# In[2]:

import os
import PyPDF2
import pathlib
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

# Set the output directory where documents will be saved.

# In[5]:

output_dir = pathlib.Path(os.getenv("OUTPUT_DIR") or "./output")
output_dir.mkdir(parents=True, exist_ok=True)

# Read in a list of all the URLs on the rail authority's site that contain consultant reports.

# In[23]:

page_list = [u for u in open("./input/urls.csv").read().splitlines() if u]

# Request each page and parse out all of its PDF links.

# In[27]:

def parse_page(url):
    """
    Parse all the PDF urls from the provided URL's HTML.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    # Only keep anchors that actually have an href attribute.
    a_list = soup.find_all("a", href=True)
    pdf_list = [a["href"] for a in a_list if a["href"].endswith(".pdf")]
    return [f"http://hsr.ca.gov{href}" for href in pdf_list]

# In[28]:

pdf_list = []

# In[31]:

for page in page_list:
    pdfs = parse_page(page)
    pdf_list.extend(pdfs)

# How many total PDF urls were found?

# In[40]:

f"{len(pdf_list):,}"

# Remove all of the duplicates.

# In[41]:

pdf_set = set(pdf_list)

# How many URLs remain?

# In[42]:

f"{len(pdf_set):,}"

# Download them all.

# In[32]:

def download_pdf(url):
    """
    Download a PDF url to the output folder.
    """
    filename = url.split("/")[-1]
    path = output_dir.joinpath(filename)
    # Skip files we've already downloaded.
    if path.exists():
        return
    try:
        print(f"Downloading {filename}")
        urlretrieve(url, path)
    except Exception:
        print(f"Failed to download {url}")

# In[35]:

for url in pdf_set:
    download_pdf(url)

# Get their page counts.

# In[45]:

def get_page_count(path):
    """
    Get the page count of the provided PDF path.
    """
    with open(path, "rb") as f:
        try:
            pdf_reader = PyPDF2.PdfFileReader(f)
            return pdf_reader.numPages
        except Exception:
            # Return NaN for PDFs that can't be parsed.
            return float("nan")

# In[36]:

path_list = list(output_dir.glob("*.pdf"))

# Count the total number of documents again to check how many we actually downloaded.

# In[47]:

f"{len(path_list):,}"

# Loop through all of the documents and tally their pages.

# In[38]:

page_counts = dict((p, get_page_count(p)) for p in path_list)

# In[49]:

df = pd.DataFrame(list(page_counts.items()), columns=["pdf", "page_count"])

# In[50]:

df.sort_values("page_count", ascending=False).head()

# In[51]:

len(df)

# In[52]:

f"{df.page_count.sum():,}"
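
# Note: newer releases of PyPDF2 removed the `PdfFileReader`/`numPages` API used above. If you re-run this notebook with a current toolchain, the same page tally could be sketched with the maintained `pypdf` package instead. This cell is a minimal alternative sketch, not part of the original analysis; the `get_page_count_pypdf` helper name is ours.

# In[ ]:

# Optional alternative to `get_page_count`, assuming `pip install pypdf`
# and the same `output_dir` of downloaded PDFs as above.
from pypdf import PdfReader

def get_page_count_pypdf(path):
    """
    Get the page count of the provided PDF path with pypdf.
    """
    try:
        return len(PdfReader(path).pages)
    except Exception:
        # Return NaN for PDFs that can't be parsed.
        return float("nan")

# Example usage, mirroring the tally above:
# page_counts = {p: get_page_count_pypdf(p) for p in output_dir.glob("*.pdf")}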