Trove Newspaper & Gazette Harvester

Download large quantities of digitised newspaper and gazette articles from Trove with this simplified, web-based interface to the TroveHarvester tool.

In [ ]:
import time
import argparse
import os
import shutil
import datetime
import arrow
import json
import ipywidgets as widgets
from IPython.display import display, HTML, FileLink, clear_output
from pprint import pprint
import re
import unicodecsv as csv
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
try:
    from urllib.parse import urlparse, parse_qsl
except ImportError:
    from urlparse import urlparse, parse_qsl
# Import everything from the troveharvester package
from troveharvester.__main__ import *
from tqdm.auto import tqdm

# Create a requests session that automatically retries failed requests,
# backing off between attempts on common server errors.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
In [ ]:
# These basically replace functions in the TroveHarvester package
# Instead of getting parameters from the command line, they get them from the widgets.

def nb_save_meta(data_dir, harvest):
    '''
    Save the query metadata in a JSON file.
    Useful for documenting your harvest.
    '''
    meta = {}
    meta['query'] = query_url.value
    meta['key'] = api_key.value
    meta['max'] = None
    meta['pdf'] = pdf.value
    meta['text'] = text.value
    meta['image'] = image.value
    meta['harvest'] = harvest
    meta['date_started'] = datetime.datetime.now().isoformat()
    meta['start'] = '*'
    with open(os.path.join(data_dir, 'metadata.json'), 'w') as meta_file:
        json.dump(meta, meta_file, indent=4)

def nb_prepare_harvest(b):
    '''
    Create output directories, get parameters from the widgets, and run the harvest.
    Triggered by the Start button.
    '''
    out.clear_output()
    harvest = str(int(time.time()))  # Get rid of fractions
    data_dir = os.path.join(os.getcwd(), 'data', harvest)
    make_dir(data_dir)
    nb_save_meta(data_dir, harvest)
    if pdf.value:
        make_dir(os.path.join(data_dir, 'pdf'))
    if text.value:
        make_dir(os.path.join(data_dir, 'text'))
    if image.value:
        make_dir(os.path.join(data_dir, 'image'))
    with out:
        nb_start_harvest(data_dir=data_dir, key=api_key.value, query=query_url.value, pdf=pdf.value, text=text.value, image=image.value, start='*', max=None)
        # out.clear_output(wait=True)
        shutil.make_archive(data_dir, 'zip', data_dir)
        display(HTML('<b>Download results</b>'))
        display(FileLink('data/{}.zip'.format(harvest)))
    
def nb_start_harvest(data_dir, key, query, pdf, text, image, start, max):
    '''
    Start a harvest.
    '''
    # Turn the query url into a dictionary of parameters
    params = prepare_query(query, text, key)
    # Create the harvester
    harvester = nb_Harvester(query_params=params, data_dir=data_dir, pdf=pdf, text=text, image=image, start=start, max=max)
    # Go!
    harvester.harvest()

class nb_Harvester(Harvester):

    def harvest(self):
        '''
        Start the harvest and loop over the result set until finished.
        '''
        params = self.query_params.copy()
        params['n'] = self.number
        with out:
            with tqdm(total=self.maximum, unit='article') as pbar:
                while self.start and (self.harvested < self.maximum):
                    params['s'] = self.start
                    response = s.get(self.api_url, params=params, timeout=30)
                    try:
                        results = response.json()
                    except (AttributeError, ValueError):
                        # The response couldn't be parsed as JSON -- stop
                        # rather than re-requesting the same page forever.
                        break
                    else:
                        records = results['response']['zone'][0]['records']
                        # process_results() saves the records and updates the
                        # harvest state ready for the next request.
                        self.process_results(records, pbar)
In [ ]:
# All the UI stuff

# Somewhere to put the results
out = widgets.Output(layout=widgets.Layout(padding='40px'))

api_key = widgets.Text(
    placeholder='Enter your Trove API key',
    description='API key:',
    disabled=False
)

query_url = widgets.Text(
    placeholder='Enter the url of your search',
    description='Query url:',
    disabled=False,
    layout=widgets.Layout(width='100%')
)

text = widgets.Checkbox(
    value=False,
    description='Save full text',
    disabled=False
)

pdf = widgets.Checkbox(
    value=False,
    description='Save PDFs (this can be slow)',
    disabled=False
)

image = widgets.Checkbox(
    value=False,
    description='Save articles as images',
    disabled=False
)

start_button = widgets.Button(
        description='Start harvest',
        disabled=False,
        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Start harvest',
        icon=''
    )

start_button.on_click(nb_prepare_harvest)

Enter your Trove API key

The harvester gets its data from the Trove API. To use the API you need a key — the process is quick, painless, and free. Once you have a key, paste it in below.

In [ ]:
display(api_key)
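
If you want to check that your key works before starting a harvest, you can send a simple request to the API. Below is a quick sketch, assuming the version 2 Trove API endpoint (the version the TroveHarvester package uses); the query term is just an arbitrary example.

In [ ]:
# Optional: a quick check that your API key is accepted (Trove API v2).
# The query term 'wragge' is an arbitrary example.
params = {
    'q': 'wragge',
    'zone': 'newspaper',
    'encoding': 'json',
    'key': api_key.value
}
response = s.get('https://api.trove.nla.gov.au/v2/result', params=params, timeout=30)
print(response.status_code)  # 200 means your key was accepted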

Enter your search query

Use the Trove web interface to construct your search. Remember that the harvester will get all of the matched results, not just the first 2,000 you see in the web interface. Once you're happy with your search, just copy the url and paste it below.

In [ ]:
display(query_url)
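
As an aside, here's roughly what happens to the url you paste in: the harvester's prepare_query() function breaks it down into a dictionary of API parameters. The sketch below illustrates the first step using the urlparse and parse_qsl functions imported above; the search url is a made-up example.

In [ ]:
# Illustration only -- breaking a search url down into its query parameters.
example_url = 'https://trove.nla.gov.au/newspaper/result?q=weather+cyclone&l-category=Article'
parsed = urlparse(example_url)
pprint(dict(parse_qsl(parsed.query)))
# {'l-category': 'Article', 'q': 'weather cyclone'}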

Set harvest options

By default the harvester only saves the metadata (date, page, title, newspaper, etc.) from the search results. If you want to save the full text content of each article, just check the 'Save full text' box. You can also save JPEG and PDF copies of every article by checking the 'Save articles as images' or 'Save PDFs' boxes, but note that this will slow down your harvest and generate large download files. If you want to save images or PDFs from very large harvests, you're probably better off installing and running the harvester on your own computer.

In [ ]:
display(text)
display(pdf)
display(image)
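
For reference, the functions defined above organise each harvest in its own timestamped directory under 'data'. With all three boxes ticked, the layout looks something like this (the timestamp shown is a hypothetical example).

In [ ]:
# Directory layout created by nb_prepare_harvest() -- for reference only.
#
# data/
#   1569200000/       <- one directory per harvest, named by timestamp
#     metadata.json   <- query, key and options saved by nb_save_meta()
#     pdf/            <- created only if 'Save PDFs' is checked
#     text/           <- created only if 'Save full text' is checked
#     image/          <- created only if 'Save articles as images' is checked
#   1569200000.zip    <- zipped copy created when the harvest finishes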
In [ ]:
display(start_button)
display(out)

Once your harvest is complete, a link will appear allowing you to download the results as a single zipped file. See this notebook for more information about the contents and format of the results folder.

You can also start to explore your results using this notebook.
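
If you'd rather take a quick look without leaving this notebook, here's a minimal sketch. It assumes the harvester has saved the article metadata to a results.csv file inside the harvest directory; the timestamped directory name is a hypothetical example, so substitute one of your own.

In [ ]:
# Peek at the first few rows of harvested metadata. Assumes a results.csv
# file inside the harvest directory; '1569200000' is a hypothetical timestamp.
with open(os.path.join('data', '1569200000', 'results.csv'), 'rb') as results_file:
    reader = csv.DictReader(results_file)  # unicodecsv reads binary files
    for row in list(reader)[:5]:
        pprint(dict(row))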


Created by [Tim Sherratt](https://timsherratt.org) ([@wragge](https://twitter.com/wragge)) as part of the [GLAM Workbench project](https://github.com/glam-workbench/).

If you think this project is worthwhile you can [support it on Patreon](https://www.patreon.com/timsherratt).