The proceedings of Australia's Commonwealth Parliament are recorded in Hansard, which is available online through the Parliamentary Library's ParlInfo database. Results in ParlInfo are generated from well-structured XML files which can be downloaded individually from the web interface – one XML file for each sitting day. This notebook shows you how to download the XML files for large scale analysis. It's an updated version of the code I used to harvest Hansard in 2016.
If you just want the data, a full harvest of the XML files for both houses from 1901 to 1980 and from 1998 to 2005 is available in this repository. XML files are not currently available for 1981 to 1997. Open Australia provides access to Hansard XML files from 2006 onwards.
The XML files are published on the Australian Parliament website under a CC-BY-NC-ND licence.
When you search in ParlInfo, your results point to fragments within a day's proceedings. Multiple fragments will be drawn from a single XML file, so there are many more results than there are files. The first step in harvesting the XML files is to work through the results for each year, scraping links to the XML files from the HTML pages and discarding any duplicates. The harvest_year() function below does this. These lists of links are saved as CSV files – one for each house and year. You can view the CSV files in the data directory.
Once you have lists of XML file URLs for both houses across all years, you can simply work through the URLs, downloading each XML file.
import re
import os
import time
import math
import requests
import requests_cache
import arrow
import pandas as pd
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Create a session that caches responses, so requests aren't repeated unnecessarily
s = requests_cache.CachedSession()
# Retry requests that fail because of temporary server problems
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
This is where all the harvested data will go.
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)
These are the basic templates for searches in ParlInfo. Later on we'll insert a date range in the query slot to filter by year, and increment the page value to work through the complete set of results.
# Years you want to harvest
# Note that no XML files are currently available for the years 1981 to 1997, so harvests of this period will fail
START_YEAR = 1901
END_YEAR = 2005
URLS = {
'hofreps': (
'http://parlinfo.aph.gov.au/parlInfo/search/summary/summary.w3p;'
'adv=yes;orderBy=date-eLast;page={page};'
'query={query}%20Dataset%3Ahansardr,hansardr80;resCount=100'),
'senate': (
'http://parlinfo.aph.gov.au/parlInfo/search/summary/summary.w3p;'
'adv=yes;orderBy=date-eLast;page={page};'
'query={query}%20Dataset%3Ahansards,hansards80;resCount=100')
}
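Just to illustrate how the templates work (this cell isn't needed for the harvest), here's the URL for the first page of House of Representatives results for 1901, using the same URL-encoded date-range query that harvest_year() builds below.

# Example only -- insert a query and page number into the House of Representatives template
example_query = 'Date%3A01%2F01%2F1901%20>>%2031%2F12%2F1901'
print(URLS['hofreps'].format(query=example_query, page=0))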
def get_total_results(house, query):
'''
Get the total number of results in the search.
'''
# Insert query and page values into the ParlInfo url
url = URLS[house].format(query=query, page=0)
# Get the results page
response = s.get(url)
# Parse the HTML
soup = BeautifulSoup(response.text)
try:
# Find where the total results are given in the HTML
summary = soup.find('div', 'resultsSummary').contents[1].string
# Extract the number of results from the string
total = re.search(r'of (\d+)', summary).group(1)
except AttributeError:
total = 0
return int(total)
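For example, you could check how many results ParlInfo returns for the House of Representatives in 1901. This makes a live request to ParlInfo, so the number reflects whatever the database currently reports.

# Example only -- count the results for a single year's date-range query
print(get_total_results('hofreps', 'Date%3A01%2F01%2F1901%20>>%2031%2F12%2F1901'))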
def get_xml_url(url):
'''
Extract the XML file url from an individual result.
'''
# Load the page for an individual result
response = s.get(url)
# Parse the HTML
soup = BeautifulSoup(response.text)
# Find the XML url by looking for a pattern in the href
try:
xml_url = soup.find('a', href=re.compile('toc_unixml'))['href']
except TypeError:
xml_url = None
if not response.from_cache:
time.sleep(1)
return xml_url
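To see what get_xml_url() is looking for, here's a tiny self-contained sketch using a made-up href value – the real result pages include a download link whose URL contains 'toc_unixml'.

# Example only -- the href below is made up for illustration
sample_html = '<a href="/parlInfo/download/example/toc_unixml/example.xml">Download XML</a>'
sample_soup = BeautifulSoup(sample_html, 'html.parser')
print(sample_soup.find('a', href=re.compile('toc_unixml'))['href'])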
def harvest_year(house, year):
'''
Loop through a search by house and year, finding all the urls for XML files.
'''
# Format the start and end dates
start_date = '01%2F01%2F{}'.format(year)
end_date = '31%2F12%2F{}'.format(year)
# Prepare the query value using the start and end dates
query = 'Date%3A{}%20>>%20{}'.format(start_date, end_date)
# Get the total results
total_results = get_total_results(house, query)
xml_urls = []
dates = []
found_dates = []
if total_results > 0:
# Calculate the number of pages in the results set
num_pages = int(math.ceil(total_results / 100))
# Loop through the page range
for page in tqdm(range(0, num_pages + 1), desc=str(year), leave=False):
# Get the next page of results
url = URLS[house].format(query=query, page=page)
response = s.get(url)
# Parse the HTML
soup = BeautifulSoup(response.text)
# Find the list of results and loop through them
for result in tqdm(soup.find_all('div', 'resultContent'), leave=False):
# Try to identify the date
try:
date = re.search(r'Date: (\d{2}\/\d{2}\/\d{4})', result.find('div', 'sumMeta').get_text()).group(1)
date = arrow.get(date, 'DD/MM/YYYY').format('YYYY-MM-DD')
except AttributeError:
#There are some dodgy dates -- we'll just ignore them
date = None
# If there's a date, and we haven't seen it already, we'll grab the details
if date and date not in dates:
found_dates.append(date)
# Get the link to the individual result page
# This is where the XML file links live
result_link = result.find('div', 'sumLink').a['href']
# Get the XML file link from the individual record page
xml_url = get_xml_url(result_link)
if xml_url:
dates.append(date)
# Save dates and links
xml_urls.append({'date': date, 'url': 'https://parlinfo.aph.gov.au{}'.format(xml_url)})
if not response.from_cache:
time.sleep(1)
for f_date in list(set(found_dates)):
if f_date not in dates:
xml_urls.append({'date': f_date, 'url': ''})
return xml_urls
# Loop through both houses and the full range of years
for house in ['hofreps', 'senate']:
    for year in range(START_YEAR, END_YEAR + 1):
        # Get all the XML file links for this house and year
        xml_urls = harvest_year(house, year)
        # Save the list of links as a CSV file
        df = pd.DataFrame(xml_urls)
        df.to_csv(os.path.join(output_dir, '{}-{}-files.csv'.format(house, year)), index=False)
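To check what a year's harvest looks like, you can peek at one of the saved CSV files. This sketch assumes the House of Representatives harvest for 1901 above has completed.

# Example only -- display the first few rows of a harvested list of links
pd.read_csv(os.path.join(output_dir, 'hofreps-1901-files.csv')).head()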
This opens up each house/year list of file links and downloads the XML files. The directory structure is simple:

-- output directory ('data' by default)
   -- hofreps
      -- 1901
         -- XML files...
for house in ['hofreps', 'senate']:
    for year in range(START_YEAR, END_YEAR + 1):
        # Create a directory for this house and year
        output_path = os.path.join(output_dir, house, str(year))
        os.makedirs(output_path, exist_ok=True)
        # Open the list of file links harvested above
        df = pd.read_csv(os.path.join(output_dir, '{}-{}-files.csv'.format(house, year)))
        # Loop through the file links
        for row in tqdm(df.itertuples(), desc=str(year), leave=False):
            if pd.notnull(row.url):
                # Extract the XML filename from the url
                filename = re.search(r'(?:%20)*([\w\(\)-]+?\.xml)', row.url).group(1)
                # Some of the later files don't include the date in the filename so we'll add it.
                if filename[:4] != str(year):
                    filename = f'{row.date}_{filename}'
                filepath = os.path.join(output_path, filename)
                # Don't download the file if we already have a copy
                if not os.path.exists(filepath):
                    response = s.get(row.url)
                    with open(filepath, 'w') as xml_file:
                        xml_file.write(response.text)
                    # Pause between requests, unless the response came from the cache
                    if not response.from_cache:
                        time.sleep(1)
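As a quick check that the downloads worked, you can peek at the start of one of the harvested files. This is just a sketch – it assumes at least one XML file has been saved in the hofreps/1901 directory.

from pathlib import Path

# Example only -- look at the beginning of the first XML file saved for the House of Representatives in 1901
sample_files = sorted(Path(output_dir, 'hofreps', '1901').glob('*.xml'))
if sample_files:
    print(sample_files[0].name)
    print(sample_files[0].read_text()[:500])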
This just merges all the house/year lists into one big list, adding columns for house and year. It saves the results as a CSV file. This will be useful to analyse things like the number of sitting days per year.
The fields in the CSV file are:

date – date of sitting day in YYYY-MM-DD format
url – url for XML file (where available)
year – year of the sitting day
house – 'hofreps' or 'senate'

Here are the results of my harvest from 1901 to 2005: all-sitting-days.csv
# Combine the house/year lists into a single dataframe
dfs = []
for house in ['hofreps', 'senate']:
    for year in range(START_YEAR, END_YEAR + 1):
        year_df = pd.read_csv(os.path.join(output_dir, '{}-{}-files.csv'.format(house, year)))
        # Add the year and house as new columns
        year_df['year'] = year
        year_df['house'] = house
        dfs.append(year_df)
df = pd.concat(dfs)
df.sort_values(by=['house', 'date'], inplace=True)
df.to_csv(os.path.join(output_dir, 'all-sitting-days.csv'), index=False)
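For example, you can use the combined CSV to count the number of sitting days per house and year – a quick sketch using the all-sitting-days.csv file created above.

# Count the number of unique sitting dates for each house and year
sitting_days = pd.read_csv(os.path.join(output_dir, 'all-sitting-days.csv'))
sitting_days.groupby(['house', 'year'])['date'].nunique()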
For convenience you can zip up each year individually.
from shutil import make_archive
for house in ['hofreps', 'senate']:
xml_path = os.path.join(output_dir, house)
for year in [d for d in os.listdir(xml_path) if d.isnumeric()]:
year_path = os.path.join(xml_path, year)
make_archive(year_path, 'zip', year_path)
Created by Tim Sherratt for the GLAM Workbench.