#!/usr/bin/env python
# coding: utf-8

# # Harvest transcripts
#
# Harvest all the XML transcripts from the PMs Transcripts site.
#
# If you don't want to harvest them all yourself, I've created
# [a repository](https://github.com/wragge/pm-transcripts) containing all the
# XML files, a CSV-formatted index, and aggregated text and zip files for each
# prime minister.

import os
import re
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# This page lists all the XML files
TRANSCRIPTS_URL = 'https://pmtranscripts.pmc.gov.au/transcripts.xml'

# Make sure the output directory exists before we try to write into it.
os.makedirs('transcripts', exist_ok=True)

# Get the list of XML files
r = requests.get(TRANSCRIPTS_URL)
r.raise_for_status()

# Turn the XML into Soup
soup = BeautifulSoup(r.text, 'lxml')

# Get the links to all the XML files
uris = soup.find_all('uri')

# Loop through all the XML files, saving each file in turn.
# Files already present on disk are skipped, so the harvest can be resumed.
for uri in tqdm(uris):
    uri = uri.string
    uri_bits = urlparse(uri)
    filename = os.path.basename(uri_bits.path)
    filepath = os.path.join('transcripts', '{}.xml'.format(filename))
    if not os.path.exists(filepath):
        try:
            # Filenames look like 'transcript-12345'; pull out the numeric id.
            # Raw string avoids the invalid-escape warning for '\d'.
            transcript_id = re.search(r'transcript-(\d+)', filename).group(1)
        except AttributeError:
            # Filename didn't match the expected pattern — skip it.
            continue
        xml_url = 'https://pmtranscripts.pmc.gov.au/query?transcript=' + transcript_id
        transcript = requests.get(xml_url)
        if transcript.ok:
            # Force utf-8 so .text decodes consistently, then write the
            # decoded text back out as utf-8 bytes.
            transcript.encoding = 'utf-8'
            with open(filepath, 'wb') as xml_file:
                xml_file.write(transcript.text.encode('utf-8'))
        # Be polite to the server between requests.
        time.sleep(0.2)