Download a page image

The Trove web interface doesn't provide a way of getting high-resolution page images from newspapers. This simple app lets you download page images as complete, high-resolution JPG files.

In [1]:
import ipywidgets as widgets
import requests
import datetime
import arrow
import random
import re
import shutil
from collections import OrderedDict
from operator import itemgetter
from IPython.display import display, HTML, FileLink, clear_output

titles = {}
out = widgets.Output()

def display_button():
    button = widgets.Button(
        description='Get page image',
        disabled=False,
        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click to download',
        icon=''
    )
    button.on_click(get_page_image)
    display(button)

def get_page_image(b):
    out.clear_output()
    # display_button()
    article = None
    page_id = None
    if article_url.value and 'page' in article_url.value:
        page_id = re.search(r'page\/{0,1}(\d+)', article_url.value).group(1)
    elif article_url.value:
        article_id = re.search(r'article\/{0,1}(\d+)', article_url.value).group(1)
        params = {
            'reclevel': 'full',
            'encoding': 'json',
            'key': api_key.value
        }
        response = requests.get('http://api.trove.nla.gov.au/v2/newspaper/{}'.format(article_id), params=params)
        data = response.json()
        article = data['article']
    else:
        end = arrow.get(date.value)
        start = end.shift(days=-1)
        date_query = 'date:[{}Z TO {}Z]'.format(start.format('YYYY-MM-DDT00:00:00'), end.format('YYYY-MM-DDT00:00:00'))

        params = {
            'zone': 'newspaper',
            'reclevel': 'full',
            'encoding': 'json',
            'n': '1',
            'q': '{} firstpageseq:{}'.format(date_query, page.value),
            'l-title': title.value,
            'key': api_key.value
        }
        response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
        data = response.json()
        try:
            article = data['response']['zone'][0]['records']['article'][0]
        except (KeyError, IndexError):
            with out:
                display(HTML('Page not found!'))
    if article:
        page_id = re.search(r'page\/(\d+)', article['trovePageUrl']).group(1)
    if page_id:
        # Construct the url we need to download the image
        page_url = 'http://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(page_id, size.value)
        # Download the page image
        response = requests.get(page_url, stream=True)
        filename = 'data/{}-level{}.jpg'.format(page_id, size.value)
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
        with out:
            display(FileLink(filename))
            display(HTML('<img src="{}">'.format(filename)))
    

def get_titles(b):
    params = {
    'encoding': 'json',
    'key': api_key.value
    }
    response = requests.get('http://api.trove.nla.gov.au/v2/newspaper/titles', params=params)
    data = response.json()
    title_list = [(t['title'], t['id']) for t in data['response']['records']['newspaper']]
    title_list.sort(key=itemgetter(0))
    titles = OrderedDict(title_list)
    title.options = titles

Enter your Trove API key

Get your own Trove API key and enter it below.

In [2]:
api_key = widgets.Text(
    placeholder='Enter your Trove API key',
    description='API key:',
    disabled=False
)
display(api_key)

Either enter an article or page url...

You can use the url in your browser's location bar or an article or page permalink.

In [3]:
article_url = widgets.Text(
    placeholder='Enter an article or page url',
    description='Article/Page:',
    disabled=False
)
display(article_url)

Or provide a date, newspaper title, and page number

If you've provided a url above these settings will be ignored.

In [4]:
date = widgets.DatePicker(
    description='Date:',
    disabled=False
)
display(date)
In [5]:
page = widgets.IntText(
    value=1,
    description='Page:',
    disabled=False
)
display(page)
In [6]:
title = widgets.Dropdown(
        options=['Click the button to load titles'],
        description='Newspaper:',
        disabled=False,
    )
titles_button = widgets.Button(
        description='Load titles',
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click to load titles',
        icon=''
    )
titles_button.on_click(get_titles)
display(widgets.HBox([title, titles_button]))

Choose the resolution

Page images are available in seven resolutions that correspond to the zoom levels in the Trove web interface. As a rough guide:

  • size 1 is around 900 x 1200 px (500kb)
  • size 4 is around 2700 x 3500 px (3mb)
  • size 7 is around 6100 x 7800 px (7mb)
In [7]:
size = widgets.BoundedIntText(
    min=1,
    max=7,
    value=4,
    description='Size:',
    disabled=False
)
display(size)

Get the image!

In [8]:
display_button()
display(out)
---------------------------------------------------------------------------
JSONDecodeError                           Traceback (most recent call last)
<ipython-input-1-eca3b956c77d> in get_page_image(b)
     56         }
     57         response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
---> 58         data = response.json()
     59         try:
     60             article = data['response']['zone'][0]['records']['article'][0]

/Volumes/Workspace/mycode/glam-workbench/trove-newspapers/lib/python3.7/site-packages/requests/models.py in json(self, **kwargs)
    896                     # used.
    897                     pass
--> 898         return complexjson.loads(self.text, **kwargs)
    899 
    900     @property

/usr/local/Cellar/python/3.7.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    346             parse_int is None and parse_float is None and
    347             parse_constant is None and object_pairs_hook is None and not kw):
--> 348         return _default_decoder.decode(s)
    349     if cls is None:
    350         cls = JSONDecoder

/usr/local/Cellar/python/3.7.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/decoder.py in decode(self, s, _w)
    335 
    336         """
--> 337         obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338         end = _w(s, end).end()
    339         if end != len(s):

/usr/local/Cellar/python/3.7.7/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/decoder.py in raw_decode(self, s, idx)
    353             obj, end = self.scan_once(s, idx)
    354         except StopIteration as err:
--> 355             raise JSONDecodeError("Expecting value", s, err.value) from None
    356         return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Created by Tim Sherratt for the GLAM Workbench.