#!/usr/bin/env python
# coding: utf-8

# # Harvest transcripts
#
# Harvest all the XML transcripts from the PMs Transcripts site.
#
# If you don't want to harvest them all yourself, I've created
# [a repository](https://github.com/wragge/pm-transcripts) containing all the
# XML files, a CSV-formatted index, and aggregated text and zip files for each
# prime minister.

import os
import re
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# This page lists all the XML files
TRANSCRIPTS_URL = 'https://pmtranscripts.pmc.gov.au/transcripts.xml'

# Make sure the output directory exists before we try to write into it.
os.makedirs('transcripts', exist_ok=True)

# Get the list of XML files
r = requests.get(TRANSCRIPTS_URL)
r.raise_for_status()

# Turn the XML into Soup
soup = BeautifulSoup(r.text, 'lxml')

# Get the links to all the XML files
uris = soup.find_all('uri')

# Loop through all the XML files, saving each file in turn.
# Files already present on disk are skipped, so the harvest can be resumed.
for uri in tqdm(uris):
    uri = uri.string
    uri_bits = urlparse(uri)
    filename = os.path.basename(uri_bits.path)
    filepath = os.path.join('transcripts', '{}.xml'.format(filename))
    if not os.path.exists(filepath):
        try:
            # Filenames look like 'transcript-12345'; pull out the numeric id.
            # Raw string avoids the invalid-escape warning for '\d'.
            transcript_id = re.search(r'transcript-(\d+)', filename).group(1)
        except AttributeError:
            # Filename didn't match the expected pattern — skip it.
            continue
        xml_url = 'https://pmtranscripts.pmc.gov.au/query?transcript=' + transcript_id
        transcript = requests.get(xml_url)
        if transcript.ok:
            # Force utf-8 so .text decodes consistently, then write the
            # decoded text back out as utf-8 bytes.
            transcript.encoding = 'utf-8'
            with open(filepath, 'wb') as xml_file:
                xml_file.write(transcript.text.encode('utf-8'))
        # Be polite to the server between requests.
        time.sleep(0.2)