Convert a year's worth of Historic Hansard into a dataframe for analysis

This notebook analyses Commonwealth Hansard XML files from this GitHub repository. Give it a year (between 1901 and 1980), and a house (either 'hofreps' or 'senate'), and it will download all the proceedings of that year and house, extract some basic data about debates and speeches, and provide the results as a dataframe for exploration.

In [3]:
import requests
import requests_cache
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
import arrow
import pandas as pd
import altair as alt

# Cached session: repeated requests for the same URL are served from the local cache.
s = requests_cache.CachedSession()
# Retry up to 5 times, with backoff, when the server answers 502, 503 or 504.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
for prefix in ('https://', 'http://'):
    s.mount(prefix, HTTPAdapter(max_retries=retries))

Note that the GitHub API only allows 60 unauthenticated requests per hour, so it's a good idea to cache things. Requests to download files aren't included in the API tally. If you need more requests you'll need to use authentication.

In [4]:
# Base URL for listing the contents of the wragge/hansard-xml repository via the GitHub API
API_URL = 'https://api.github.com/repos/wragge/hansard-xml/contents'

Set the year and house you're interested in.

In [5]:
year = '1901' # a year from 1901 to 1980, given as a string
house = 'hofreps' # either 'hofreps' or 'senate'
In [6]:
def count_words(para):
    '''
    Return the number of whitespace-separated words in an element's text.

    Every string yielded by the element's `stripped_strings` is split on
    whitespace and the pieces are tallied.
    '''
    return sum(len(text.split()) for text in para.stripped_strings)

def get_paras(section):
    '''
    Count the words in the para-type containers directly inside an element.

    Only immediate children named 'para', 'quote' or 'list' are examined
    (the search is not recursive); each one is passed to count_words.

    Parameters:
        section: the element to search, or None.

    Returns:
        The total number of words as an int. A missing section (None, which is
        what `find` returns when nothing matches) counts as 0 instead of
        raising an AttributeError.
    '''
    if section is None:
        return 0
    words = 0
    for para in section.find_all(['para', 'quote', 'list'], recursive=False):
        words += count_words(para)
    return words

def get_words_in_speech(start, speech):
    '''
    Total the words in a speech: its opening 'talk.start' element, the speech
    element itself, and each 'continue' element directly inside the speech
    (together with that continuation's own 'talk.start').
    '''
    total = get_paras(start) + get_paras(speech)
    for continuation in speech.find_all('continue', recursive=False):
        total += get_paras(continuation.find('talk.start', recursive=False))
        total += get_paras(continuation)
    return total
                            
def get_interjections(speech):
    '''
    Get details of any interjections within a speech.

    Only 'interjection' elements that are direct children of the speech are
    examined.

    Returns:
        A list with one dict per interjection, holding 'interjection_idx'
        (its position within the speech), 'speaker', 'id', 'type' (the
        element's tag name) and 'words'.
    '''
    interjections = []
    for index, intj in enumerate(speech.find_all('interjection', recursive=False)):
        start = intj.find('talk.start', recursive=False)
        speaker = start.find('talker')
        name = speaker.find('name', role='metadata').string
        speaker_id = speaker.find('name.id').string
        interjections.append({
            'interjection_idx': index,
            # Copy the parsed strings into plain str objects: the strings handed
            # back by the parser keep a reference to the whole parse tree, which
            # would otherwise stay in memory for as long as the results do.
            'speaker': None if name is None else str(name),
            'id': None if speaker_id is None else str(speaker_id),
            'type': intj.name,
            'words': get_words_in_speech(start, intj)
        })
    return interjections

def get_speeches(debate):
    '''
    Get details of any speeches in a debate (or subdebate).

    Only 'speech', 'question' and 'answer' elements that are direct children
    of the debate are examined.

    Returns:
        A list of dicts. Each speech contributes one dict with 'speech_idx'
        (its position within the debate), 'speaker', 'id', 'type' (the
        element's tag name) and 'words'. It is followed by one dict for each
        of its interjections, tagged with the same 'speech_idx'.
    '''
    speeches = []
    for index, speech in enumerate(debate.find_all(['speech', 'question', 'answer'], recursive=False)):
        start = speech.find('talk.start', recursive=False)
        speaker = start.find('talker')
        name = speaker.find('name', role='metadata').string
        speaker_id = speaker.find('name.id').string
        speeches.append({
            'speech_idx': index,
            # Copy the parsed strings into plain str objects: the strings handed
            # back by the parser keep a reference to the whole parse tree, which
            # would otherwise stay in memory for as long as the results do.
            'speaker': None if name is None else str(name),
            'id': None if speaker_id is None else str(speaker_id),
            'type': speech.name,
            'words': get_words_in_speech(start, speech)
        })
        # Interjections are within a speech, so tag each with the speech index
        for intj in get_interjections(speech):
            intj['speech_idx'] = index
            speeches.append(intj)
    return speeches

def get_subdebates(debate):
    '''
    Get details of any subdebates within a debate.

    Only 'subdebate.1' elements that are direct children of the debate are
    examined.

    Returns:
        A list of the speech dicts found in each subdebate, each updated with
        'subdebate_title' and 'subdebate_idx' (the subdebate's position within
        the debate).
    '''
    speeches = []
    for index, sub in enumerate(debate.find_all('subdebate.1', recursive=False)):
        title = sub.subdebateinfo.title.string
        subdebate_info = {
            # Copy the title into a plain str: the string handed back by the
            # parser keeps a reference to the whole parse tree, which would
            # otherwise stay in memory for as long as the results do.
            'subdebate_title': None if title is None else str(title),
            'subdebate_idx': index
        }
        # Add the subdebate info to each speech
        for sp in get_speeches(sub):
            sp.update(subdebate_info)
            speeches.append(sp)
    return speeches

def get_debates(soup):
    '''
    Get details of all the debates in a day's proceedings.

    The date is read from the 'session.header' element, and every 'debate'
    element in the document is examined.

    Returns:
        A list of speech dicts: for each debate, the speeches from its
        subdebates followed by the speeches directly inside the debate. Each
        dict is updated with 'date', 'debate_title', 'debate_type' and
        'debate_idx' (the debate's position within the document).
    '''
    speeches = []
    # The strings handed back by the parser keep a reference to the whole parse
    # tree, so they are copied into plain str objects before being stored;
    # otherwise every day's tree would stay in memory along with the results.
    date = soup.find('session.header').date.string
    date = None if date is None else str(date)
    for index, debate in enumerate(soup.find_all('debate')):
        title = debate.debateinfo.title.string
        debate_type = debate.debateinfo.type.string
        debate_info = {
            'date': date,
            'debate_title': None if title is None else str(title),
            'debate_type': None if debate_type is None else str(debate_type),
            'debate_idx': index
        }
        new_speeches = get_subdebates(debate)
        new_speeches += get_speeches(debate)
        # Add the debate info to each speech
        for sp in new_speeches:
            sp.update(debate_info)
        speeches += new_speeches
    return speeches

def summarise_year(year, house):
    '''
    Get each day's proceedings for the supplied year/house and extract information about debates and speeches.

    Parameters:
        year: the year to harvest; it is inserted into the API path as given.
        house: the house to harvest; it is inserted into the API path as given.

    Returns:
        A dataframe with one row per speech or interjection.

    Raises:
        requests.HTTPError: if the directory listing or any file download
            comes back with an error status.
    '''
    speeches = []
    response = s.get(f'{API_URL}/{house}/{year}')
    # Stop with a clear HTTP error if the listing request was refused (for
    # example once the API's request limit has been used up). Without this
    # check the error payload would be treated as a file listing and fail with
    # a confusing TypeError below.
    response.raise_for_status()
    data = response.json()
    files = [f for f in data if f['type'] == 'file']
    for f in tqdm(files):
        response = s.get(f['download_url'])
        # Don't try to parse an error page as if it were a day's proceedings
        response.raise_for_status()
        # NOTE(review): no parser is named here, so BeautifulSoup chooses one
        # from whatever is installed -- results may differ between environments.
        soup = BeautifulSoup(response.text)
        speeches += get_debates(soup)
    df = pd.DataFrame(speeches)
    return df
In [7]:
# Harvest and summarise every sitting day for the selected year and house
df = summarise_year(year, house)

In [8]:
# Preview the first few rows of the harvested data
df.head()
Out[8]:
speech_idx speaker id type words subdebate_title subdebate_idx date debate_title debate_type debate_idx interjection_idx
0 0 MACDONALD-PATERSON, Thomas KIQ speech 318 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION Questions 9 NaN
1 1 BRADDON, Edward JRR speech 178 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION Questions 9 NaN
2 2 SMITH, Arthur KTT speech 693 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION Questions 9 NaN
3 2 CHAPMAN, Austin JX7 interjection 9 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION Questions 9 0.0
4 3 CAMERON, Donald Norman JUJ speech 98 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION Questions 9 NaN

Who made the most speeches?

In [9]:
# Count the rows of type 'speech' for each speaker and show the 20 largest counts
df.loc[df['type'] == 'speech', 'speaker'].value_counts().head(20)
Out[9]:
BARTON, Edmund            439
KINGSTON, Charles         303
MCMILLAN, William         215
DEAKIN, Alfred            204
CONROY, Alfred            180
PIESSE, Frederick         166
THOMSON, Dugald           153
WATSON, John Christian    150
REID, George              146
ISAACS, Isaac             146
GLYNN, Patrick            140
SPEAKER, Mr               140
CROUCH, Richard           136
O'MALLEY, King            119
MCCAY, James              118
MCEACHARN, Malcolm        115
MAUGER, Samuel            109
LYNE, William             108
POYNTON, Alexander        108
TURNER, George            107
Name: speaker, dtype: int64

Who made the most interjections?

In [10]:
# Count the rows of type 'interjection' for each speaker and show the 20 largest counts
df.loc[df['type'] == 'interjection', 'speaker'].value_counts().head(20)
Out[10]:
KINGSTON, Charles         1257
DEAKIN, Alfred            1097
BARTON, Edmund            1001
TURNER, George             906
REID, George               801
MCMILLAN, William          775
MAUGER, Samuel             604
LYNE, William              551
WATSON, John Christian     550
COOK, Joseph               536
HIGGINS, Henry             535
ISAACS, Isaac              482
MCEACHARN, Malcolm         429
THOMSON, Dugald            391
CONROY, Alfred             355
MCCAY, James               355
FORREST, John              332
SOLOMON, Vaiben            321
POYNTON, Alexander         300
MCDONALD, Charles          284
Name: speaker, dtype: int64

Who spoke the most words?

In [11]:
# Total the words for each speaker, then show the 20 largest totals
(
    df.groupby('speaker')['words']
    .sum()
    .reset_index()
    .sort_values('words', ascending=False)
    .head(20)
)
Out[11]:
speaker words
2 BARTON, Edmund 201547
65 REID, George 140732
55 MCMILLAN, William 138382
41 KINGSTON, Charles 132851
74 SPEAKER, Mr 128840
78 THOMSON, Dugald 112445
18 DEAKIN, Alfred 104408
82 WATSON, John Christian 99848
49 MCCAY, James 98219
12 CONROY, Alfred 97755
80 TURNER, George 94780
21 EDWARDS, George 93070
39 ISAACS, Isaac 91439
35 HIGGINS, Henry 90842
64 QUICK, John 88777
14 COOK, Joseph 88317
62 PIESSE, Frederick 86988
73 SOLOMON, Vaiben 86977
29 GLYNN, Patrick 83018
84 WILKS, William 81424

Which debates generated the most words?

Note that there's variation in the way debate titles were recorded, and in the OCR results, so this sort of grouping isn't always going to work. To get something more accurate, you'd have to do some normalisation of debate titles first.

In [12]:
# Total the words for each debate title, then show the 20 largest totals
(
    df.groupby(['debate_title'])['words']
    .sum()
    .reset_index()
    .sort_values('words', ascending=False)
    .head(20)
)
Out[12]:
debate_title words
111 QUESTION 1084980
74 MOTION OF CENSURE 488836
96 POST AND TELEGRAPH BILL 334188
31 CUSTOMS BILL 303111
58 IMMIGRATION RESTRICTION BILL 301900
109 PUBLIC SERVICE BILL 260357
135 TARIFF 174766
35 DEFENCE BILL 136801
129 SUPPLY BILL 92487
81 PACIFIC ISLANDS LABOURERS BILL 86791
24 COMMONWEALTH PUBLIC SERVICE BILL 86225
38 DISTILLATION BILL 83656
5 ADJOURNMENT 79587
18 BRITISH NEW GUINEA 64175
60 INTER-STATE COMMISSION BILL 50156
55 GOVERNOR-GENERAL'S SPEECH 47909
125 STATEMENT SHOWING THE AMOUNTS RECEIVE!) BV WHI... 44751
19 BUDGET 43095
105 PROPERTY FOR PUBLIC PURPOSES ACQUISITION BILL 34402
0 ACTS INTERPRETATION BILL 33693

How many words were spoken each day of proceedings?

I've only included words in speeches with identified speakers (including interjections), so some procedural content might not be included in the totals.

In [13]:
# Total the words recorded on each date
words_per_day = df.groupby(['date'])['words'].sum().reset_index()

# Bar chart with one thin bar per sitting day
alt.Chart(words_per_day).mark_bar(size=2).encode(
    x=alt.X('date:T'),
    y=alt.Y('words:Q'),
    tooltip=['date:T', 'words:Q']
).properties(width=700)
Out[13]:
In [14]:
# Among rows flagged as questions (by debate type, debate title, or speech type),
# show the 20 most frequent subdebate titles
df.loc[
    (df['debate_type'] == 'Questions')
    | (df['debate_title'] == 'QUESTION')
    | (df['type'] == 'question'),
    'subdebate_title'
].value_counts().head(20)
Out[14]:
TARIFF                                    1161
THE TARIFF                                 887
THE GOVERNOR-GENERAL'S SPEECH              415
MOTION OFCENSURE                           347
G OVERNOR - GENERAL'S SPEECH               277
WEAVERS' PRICES AT THE ANTIPODES           259
SUPPLY                                     232
GOVERNOR-GENERAL'S SPEECH                  219
EDMUND BARTON                              193
WESTERN AUSTRALIAN MONEYORDER OFFICERS     167
EMOLUMENTS OF MINISTERS                    166
JOHN JOSEPH EA STICK                       125
THEGOVERNOR-GENERAL'S SPEECH                64
OLD-AGE PENSIONS                            63
WAYS AND MEANS                              59
RATE OF WAGE : HOURS OF LABOUR              41
FEDERAL CAPITAL SITE                        33
THIRD SCHEDULE                              32
ADDITIONAL SITTING DAY                      27
DEPARTMENT OF AGRICULTURE                   24
Name: subdebate_title, dtype: int64

Created by Tim Sherratt for the GLAM Workbench.