Make composite images from lots of Trove newspaper thumbnails

This notebook starts with a search in Trove's newspapers. It uses the Trove API to work its way through the search results. For each article it creates a thumbnail image using the code from this notebook. Once this first stage is finished, you have a directory full of lots of thumbnails.

The next stage takes all those thumbnails and pastes them one by one into a BIG image to create a composite, or mosaic.

You'll need to think carefully about the number of results in your search, and the size of the image you want to create. Harvesting all the thumbnails can take a long time.

Also, you need to be able to set a path to a font file, so it's probably best to run this notebook on your local machine rather than in a cloud service; that way you have more control over things like fonts. You might also need to adjust the font size depending on the font you choose.

Some examples:

In [ ]:
import ipywidgets as widgets
import requests
import random
import re
import os
from IPython.display import display, HTML, FileLink, clear_output
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm_notebook

# Shared HTTP session. Requests made through it are retried (up to 5 times,
# with backoff) when the server answers 502, 503 or 504.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for _scheme in ('https://', 'http://'):
    s.mount(_scheme, HTTPAdapter(max_retries=retries))

Set your parameters

Edit the values below as required.

In [ ]:
# Path to the TrueType font file used to label the thumbnails
font_path = '/Library/Fonts/Courier New.ttf'
# Font size handed to the thumbnail functions along with font_path
font_size = 12
# Insert your search query
query = 'title:"white australia policy"'
# Insert your Trove API key
api_key = ''
size = 200 # Size of the thumbnails (each one is fitted into a size x size square)
cols = 90 # The width of the final image will be cols x size
rows = 55 # The height of the final image will be rows x size

Define some functions

In [ ]:
def get_article_top(article_url):
    '''
    Positional information about the article is attached to each line of the OCR output in data attributes.
    This function loads the HTML version of the article and scrapes the x, y, and width values for the
    top line of text (ie the top of the article).

    Parameters:
        article_url -- url of the HTML version of the article

    Returns:
        A dict with integer values: {'x': ..., 'y': ..., 'w': ...}

    Raises:
        IndexError -- if the page contains no 'div.zone.onPage' elements
        KeyError -- if the top element lacks one of the data-x / data-y / data-w attributes
    '''
    # Go through the shared session so this request gets the same retry
    # behaviour as the other downloads; the timeout stops a stalled
    # connection from hanging the whole harvest.
    response = s.get(article_url, timeout=60)
    soup = BeautifulSoup(response.text, 'lxml')
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    zones = soup.select('div.zone.onPage')
    if not zones:
        raise IndexError('No OCR zones found on the current page of {}'.format(article_url))
    # Illustrations might come after text even if they're above them on the page,
    # so pick the zone with the lowest 'y' value rather than the first one.
    # (min() returns the first zone when several share the lowest value.)
    top_element = min(zones, key=lambda zone: int(zone['data-y']))
    return {
        'x': int(top_element['data-x']),
        'y': int(top_element['data-y']),
        'w': int(top_element['data-w'])
    }

def get_thumbnail(article, size, font_path, font_size):
    '''
    Create a labelled thumbnail from the top of a newspaper article.

    The page image is downloaded, a square box as wide as the article's top line
    is cropped from it, the crop is shrunk to fit within size x size, and the
    article identifier is written on a white strip along the bottom edge.

    Parameters:
        article -- article record from the Trove API (uses 'trovePageUrl', 'troveUrl' and 'id')
        size -- maximum width and height of the thumbnail
        font_path -- path to the TrueType font used for the label
        font_size -- size of the label font

    Returns:
        A PIL image, or None if no thumbnail could be created (no page id, no
        positional information, or an unreadable page image).
    '''
    # Extra space to include around the crop box
    buffer = 0
    try:
        page_id = re.search(r'page\/(\d+)', article['trovePageUrl']).group(1)
    except (AttributeError, KeyError):
        # No page url, or no page id in the url
        return None
    try:
        # Get position of top line of article
        article_top = get_article_top(article['troveUrl'])
    except IndexError:
        # The article page has no OCR zones, so there's nothing to position the crop
        return None
    # Construct the url we need to download the image
    page_url = 'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(page_id, 7)
    # Download the page image
    response = s.get(page_url, timeout=120)
    # Use coordinates of top line to create a square box to crop thumbnail
    left = article_top['x'] - buffer
    top = article_top['y'] - buffer
    right = article_top['x'] + article_top['w'] + buffer
    bottom = article_top['y'] + article_top['w'] + buffer
    try:
        # Open download as an image for editing, then crop it to create the thumb.
        # Both steps raise OSError if the download isn't a readable image.
        img = Image.open(BytesIO(response.content))
        thumb = img.crop((left, top, right, bottom))
    except OSError:
        return None
    # Resize thumb (LANCZOS is the filter the removed ANTIALIAS constant referred to)
    thumb.thumbnail((size, size), Image.LANCZOS)
    article_id = 'nla.news-article{}'.format(article['id'])
    # Deliberately outside the try block: a bad font_path should be reported, not skipped
    fnt = ImageFont.truetype(font_path, font_size)
    draw = ImageDraw.Draw(thumb)
    # thumbnail() never enlarges, so the result can be smaller than size x size.
    # Position the label using the real dimensions so it's always on the image.
    width, height = thumb.size
    label_top = height - 10
    # Colour names work for both RGB and grayscale images
    draw.rectangle([(0, label_top), (width, height)], fill='white')
    draw.text((0, label_top), article_id, font=fnt, fill='black')
    return thumb
        
def get_total_results(params):
    '''
    Ask the Trove API how many records match a search.

    Parameters:
        params -- dict of API query parameters (left unmodified)

    Returns:
        The 'total' value of the first zone in the response, as an int.
    '''
    # Build a separate dict with 'n' set to 0 so the caller's params aren't changed
    count_params = dict(params, n=0)
    response = s.get('https://api.trove.nla.gov.au/v2/result', params=count_params, timeout=60)
    records = response.json()['response']['zone'][0]['records']
    return int(records['total'])
        
def get_thumbnails(query, api_key, size, font_path, font_size):
    '''
    Harvest all the results of a newspaper search and save a thumbnail for each article.

    Thumbnails are saved in the 'thumbs' directory (created if necessary) as
    '{date}-nla.news-article{id}.jpg'. Articles that already have a file there are
    skipped, so an interrupted harvest can be restarted without repeating downloads.

    Parameters:
        query -- search query
        api_key -- Trove API key
        size -- maximum width and height of each thumbnail
        font_path -- path to the TrueType font used to label thumbnails
        font_size -- size of the label font
    '''
    params = {
        'q': query,
        'zone': 'newspaper',
        'encoding': 'json',
        'bulkHarvest': 'true',
        'n': 100,
        'key': api_key,
        'reclevel': 'full'
    }
    # Make sure there's somewhere to save the thumbnails
    os.makedirs('thumbs', exist_ok=True)
    start = '*'
    total = get_total_results(params)
    with tqdm_notebook(total=total) as pbar:
        while start:
            params['s'] = start
            response = s.get('https://api.trove.nla.gov.au/v2/result', params=params, timeout=60)
            records = response.json()['response']['zone'][0]['records']
            # The nextStart parameter is used to get the next page of results.
            # If there's no nextStart then it means we're on the last page of results.
            start = records.get('nextStart')
            # A page without an 'article' list is treated as empty
            for article in records.get('article', []):
                thumb_file = 'thumbs/{}-nla.news-article{}.jpg'.format(article['date'], article['id'])
                if not os.path.exists(thumb_file):
                    # get_thumbnail returns None when it can't make a thumbnail
                    # (eg the article has no page id)
                    thumb = get_thumbnail(article, size, font_path, font_size)
                    if thumb is not None:
                        thumb.save(thumb_file)
                pbar.update(1)
    
def create_composite(cols, rows, size):
    '''
    Paste the saved thumbnails into one big image.

    Thumbnails are read from the 'thumbs' directory and laid out left to right,
    top to bottom, in a grid of cols x rows cells that are each size x size.
    The result is saved as 'composite-{cols}-{rows}.jpg'.

    Parameters:
        cols -- number of thumbnails across
        rows -- number of thumbnails down
        size -- width and height of each grid cell
    '''
    im = Image.new('RGB', (cols*size, rows*size))
    thumbs = [t for t in os.listdir('thumbs') if t.endswith('.jpg')]
    # File names start with the article date, so uncomment the next line
    # if you want the thumbnails arranged by date
    # thumbs = sorted(thumbs)
    capacity = cols * rows
    # Count the thumbnails actually pasted, so a skipped file doesn't leave a gap
    # or make a row wrap early
    placed = 0
    for thumb_file in tqdm_notebook(thumbs):
        if placed >= capacity:
            # The grid is full, anything else would land outside the image
            break
        row, col = divmod(placed, cols)
        x = col * size
        y = row * size
        with Image.open(os.path.join('thumbs', thumb_file)) as thumb:
            try:
                im.paste(thumb, (x, y, x+size, y+size))
            except ValueError:
                # The thumbnail isn't exactly size x size, so leave it out
                continue
        placed += 1
    im.save('composite-{}-{}.jpg'.format(cols, rows), quality=90)
    

Create all the thumbnails

In [ ]:
# Work through the search results, saving a thumbnail for each article in the 'thumbs' directory
get_thumbnails(query, api_key, size, font_path, font_size)

Turn the thumbnails into one big image

In [ ]:
# Paste the saved thumbnails into a single image, saved as 'composite-{cols}-{rows}.jpg'
create_composite(cols, rows, size)