By Ben Welsh
This analysis was conducted for the April 27, 2019, story "How California’s faltering high-speed rail project was ‘captured’ by costly consultants."
It found that outside consultants have provided more than 3,000 environmental statements, business documents and other reports to the California High-Speed Rail Authority. Altogether they contain more than 151,000 pages.
Import Python tools
```python
import os
import pathlib
from urllib.request import urlretrieve

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup
```
Set the output directory where documents will be saved.
```python
output_dir = pathlib.Path(os.getenv("OUTPUT_DIR") or "./output")
```
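The downloads below assume this folder already exists. If it might not, pathlib can create it up front; a small addition that wasn't in the original notebook:

```python
# Create the output directory, and any missing parents, if it isn't already there.
output_dir.mkdir(parents=True, exist_ok=True)
```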
Read in a list of all the URLs on the rail authority's site that contain consultant reports.
```python
# Filter out blank lines so a trailing newline doesn't yield an empty URL.
page_list = [u.strip() for u in open("./input/urls.csv") if u.strip()]
```
Request each page and parse out all of its PDF links.
```python
def parse_page(url):
    """
    Parse all the PDF urls from the provided URL's HTML.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    a_list = soup.find_all("a")
    # Guard with .get() so anchors without an href don't raise a KeyError.
    pdf_list = [a["href"] for a in a_list if a.get("href", "").endswith(".pdf")]
    return [f"http://hsr.ca.gov{href}" for href in pdf_list]

pdf_list = []
for page in page_list:
    pdfs = parse_page(page)
    pdf_list.extend(pdfs)
```
How many total PDF urls were found?
f"{len(pdf_list):,}"
'3,410'
Remove all of the duplicates.
```python
pdf_set = set(pdf_list)
```
How many URLs remain?
f"{len(pdf_set):,}"
'3,168'
Download them all.
```python
def download_pdf(url):
    """
    Download a PDF url to the output folder.
    """
    filename = url.split("/")[-1]
    path = output_dir.joinpath(filename)
    if path.exists():
        return
    try:
        print(f"Downloading {filename}")
        urlretrieve(url, path)
    except Exception:
        print(f"Failed to download {url}")

for url in pdf_set:
    download_pdf(url)
```
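As an aside, `urlretrieve` works but belongs to urllib's legacy interface. An equivalent sketch using the `requests` library already imported above would stream each response to disk in chunks; the helper name and chunk size here are arbitrary choices, not part of the original analysis:

```python
def download_pdf_streaming(url):
    """
    Alternative downloader that streams the response with requests.
    """
    path = output_dir.joinpath(url.split("/")[-1])
    if path.exists():
        return
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(path, "wb") as f:
            # Write in chunks so large PDFs aren't held entirely in memory.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
```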
Get their page counts.
```python
def get_page_count(path):
    """
    Get the page count of the provided PDF path.
    """
    with open(path, 'rb') as f:
        try:
            pdfReader = PyPDF2.PdfFileReader(f)
            return pdfReader.numPages
        except Exception:
            # Some PDFs are corrupt or encrypted; mark those as missing.
            return float("nan")
```
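Note that `PdfFileReader` and `numPages` are the PyPDF2 1.x API this notebook was written against. In the successor pypdf package, the same count would be read like this; a sketch, assuming pypdf is installed:

```python
from pypdf import PdfReader

def get_page_count_pypdf(path):
    """Get a PDF's page count with the modern pypdf API."""
    try:
        return len(PdfReader(path).pages)
    except Exception:
        return float("nan")
```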
```python
path_list = list(output_dir.glob('*.pdf'))
```
Count the total number of documents again to check how many we actually downloaded.
f"{len(pdf_path_list):,}"
'3,129'
Loop through all the documents and tally pages.
```python
page_counts = {p: get_page_count(p) for p in path_list}
df = pd.DataFrame(list(page_counts.items()), columns=["pdf", "page_count"])
df.sort_values("page_count", ascending=False).head()
```
| | pdf | page_count |
|---|---|---|
| 1413 | /media/palewire/Passport/hsr-pdfs/final_EIR_Me... | 8923.0 |
| 142 | /media/palewire/Passport/hsr-pdfs/final_ERIS_F... | 4297.0 |
| 2108 | /media/palewire/Passport/hsr-pdfs/BayCValley20... | 2126.0 |
| 2508 | /media/palewire/Passport/hsr-pdfs/brdmtg_04211... | 1523.0 |
| 2421 | /media/palewire/Passport/hsr-pdfs/Los_Angeles_... | 1369.0 |
Count the rows one more time.
```python
len(df)
```
3129

Sum the total number of pages.
```python
f"{df.page_count.sum():,}"
```
'151,703.0'
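To keep the tally around outside the notebook, the frame could be exported; a minimal sketch, with a filename that is an assumption rather than part of the original analysis:

```python
# Write the per-document page counts to a CSV alongside the PDFs.
df.to_csv(output_dir.joinpath("page_counts.csv"), index=False)
```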