Aggregate transcripts

Depending on how you want to analyse them, it can be useful to group the transcripts by prime minister.

This notebook aggregates the transcripts in two ways: by extracting the text content of each XML file and combining them into one big text file, and by zipping up the original XML files.

In [21]:
from operator import itemgetter, attrgetter
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
import zipfile

INDEX = "index.csv"

Combine into one big text file for each PM

In [16]:
def combine_pm(pm, release_type=None):
    '''
    Extract text from the XML files for the specified PM and combine into one big text file.
    Can be filtered by 'release_type'.
    '''
    os.makedirs('pms', exist_ok=True)
    transcripts = []
    df = pd.read_csv(INDEX, keep_default_na=False)
    if release_type:
        rows = df.loc[(df['pm'] == pm) & (df['release_type'] == release_type)]
    else:
        rows = df.loc[(df['pm'] == pm)]
    transcript_ids = rows.sort_values(by='date')['id'].to_list()
    filename = pm.lower().replace(', ', '-')
    if release_type:
        filename = '{}-{}'.format(filename, release_type.lower())
    with open(os.path.join('pms', filename + '.txt'), 'w') as pm_file:
        for t_id in transcript_ids:
            with open(os.path.join('transcripts', 'transcript-{}.xml'.format(t_id)), 'rb') as xml_file:
                soup = BeautifulSoup(xml_file, 'xml')
                content = soup.find('content').string.replace('<![CDATA[', '').replace(']]>', '')
                clean_content = re.sub('<[^<]+?>', '', content)
                pm_file.write(clean_content + '\n\n')

def combine_all_pms(type=None):
    df = pd.read_csv(INDEX, keep_default_na=False)
    pms = [pm for pm in pd.unique(df['pm']) if pm != '']
    for pm in pms:
        combine_pm(pm, type)
In [17]:
combine_all_pms()
In [18]:
# Just get the speeches
combine_all_pms('Speech')

Zip up the transcripts for each PM

In [23]:
def zip_pm(pm):
    os.makedirs('pms', exist_ok=True)
    filename = os.path.join('pms', '{}.zip'.format(pm.lower().replace(', ', '-')))
    transcript_ids = df.loc[(df['pm'] == pm)]['id'].to_list()
    zf = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)
    for t_id in transcript_ids:
        t_file = 'transcript-{}.xml'.format(t_id)
        t_path = os.path.join('transcripts', t_file)
        zf.write(t_path, t_file)
    zf.close()
    
def zip_all_pms():
    df = pd.read_csv(INDEX, keep_default_na=False)
    pms = [pm for pm in pd.unique(df['pm']) if pm != '']
    for pm in pms:
        zip_pm(pm)
In [24]:
zip_all_pms()
/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/zipfile.py:1470: UserWarning: Duplicate name: 'transcript-31836.xml'
  return self._open_to_write(zinfo, force_zip64=force_zip64)