Notebook

Make composite images from lots of Trove newspaper thumbnails¶

This notebook starts with a search in Trove's newspapers. It uses the Trove API to work its way through the search results. For each article it creates a thumbnail image using the code from this notebook. Once this first stage is finished, you have a directory full of lots of thumbnails.

The next stage takes all those thumbnails and pastes them one by one into a BIG image to create a composite, or mosaic.

You'll need to think carefully about the number of results in your search, and the size of the image you want to create. Harvesting all the thumbnails can take a long time.

Also, you need to be able to set a path to a font file, so it's probably best to run this notebook on your local machine rather than in a cloud service, so you have more control over things like fonts. You might also need to adjust the font size depending on the font you choose.

Some examples:

In [ ]:

import os
import re
from io import BytesIO
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Shared HTTP session used by the functions below.
# Responses with a 502, 503, or 504 status are retried (at most 5 retries in
# total, with a backoff between attempts) for both https and http urls.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# Make sure the directory that the thumbnails are saved into exists
Path("thumbs").mkdir(exist_ok=True)

In [ ]:

%%capture
# Load variables (for example TROVE_API_KEY) from the .env file if it exists.
# The %%capture cell magic above is used to suppress messages from this cell.
%load_ext dotenv
%dotenv

Set your parameters¶

Edit the values below as required.

In [ ]:

# Path to the TrueType font file used to label the thumbnails.
# NOTE: the second assignment overrides the first, so only the last path is
# actually used -- delete or reorder these lines to suit your system.
font_path = "/Library/Fonts/Courier New.ttf"
font_path = "/usr/share/fonts/truetype/freefont/FreeMono.ttf"
font_size = 12
# Insert your search query below
query = 'title:"white australia policy" date:[1960 TO 1969]'

size = 200  # Size of the thumbnails
cols = 90  # The width of the final image will be cols x size
rows = 55  # The height of the final image will be rows x size

# Insert your Trove API key
api_key = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    api_key = os.getenv("TROVE_API_KEY")

Define some functions¶

In [ ]:

def get_article_top(article_url):
    """
    Find the position of the top of an article on its newspaper page.

    Positional information about the article is attached to each line of the OCR
    output in data attributes. This function loads the HTML version of the article
    and scrapes the x, y, and width values of the topmost zone (ie the top of
    the article).

    Parameters:
        article_url: url of the article's web page.

    Returns:
        A dict with integer 'x', 'y', and 'w' values taken from the topmost zone.

    Raises:
        IndexError: if the page contains no zones for the current page.
    """
    # Use the shared session so this request gets the same retry behaviour as
    # the other downloads, and set a timeout so a stalled request can't hang
    # the whole harvest.
    response = s.get(article_url, timeout=60)
    soup = BeautifulSoup(response.text, "lxml")
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    zones = soup.select("div.zone.onPage")
    if not zones:
        raise IndexError("No OCR zones found in {}".format(article_url))
    # Illustrations might come after text even if they're above them on the page,
    # so pick the zone with the lowest 'y' attribute rather than the first one.
    # (min() returns the first zone when several share the lowest value.)
    top_element = min(zones, key=lambda zone: int(zone["data-y"]))
    return {
        "x": int(top_element["data-x"]),
        "y": int(top_element["data-y"]),
        "w": int(top_element["data-w"]),
    }


def get_thumbnail(article, size, font_path, font_size):
    """
    Create a labelled thumbnail from the top of a newspaper article.

    The page image containing the article is downloaded and a square region,
    as wide as the article's top line, is cropped out starting at the top of
    the article. The crop is shrunk to fit within size x size pixels and the
    article identifier is written on a white strip along its bottom edge.

    Parameters:
        article: an article record from the API; the 'trovePageUrl',
            'troveUrl', and 'id' values are used.
        size: maximum width and height of the thumbnail in pixels.
        font_path: path to the TrueType font file used for the label.
        font_size: size of the label text; also used as the height of the
            white strip behind it.

    Returns:
        The thumbnail image, or None if the page id or the article's position
        can't be determined, or the page image can't be read.
    """
    buffer = 0  # Extra pixels to include around the cropped area
    try:
        page_id = re.search(r"page\/(\d+)", article["trovePageUrl"]).group(1)
    except (AttributeError, KeyError):
        # No page url, or no page id in it, so there's no page image to crop
        return None
    try:
        # Get position of top line of article
        article_top = get_article_top(article["troveUrl"])
    except (IndexError, KeyError, ValueError):
        # Positional data is missing or malformed -- skip this article rather
        # than stopping the whole harvest
        return None
    # Construct the url we need to download the image
    page_url = (
        "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
            page_id, 7
        )
    )
    # Download the page image
    response = s.get(page_url, timeout=120)
    # Use coordinates of top line to create a square box to crop thumbnail
    box = (
        article_top["x"] - buffer,
        article_top["y"] - buffer,
        article_top["x"] + article_top["w"] + buffer,
        article_top["y"] + article_top["w"] + buffer,
    )
    try:
        # Open download as an image and crop it to create thumb.
        # Opening is inside the try because a response that isn't a readable
        # image also raises a subclass of OSError.
        img = Image.open(BytesIO(response.content))
        thumb = img.crop(box)
    except OSError:
        return None
    # Resize thumb (LANCZOS is the current name of the old ANTIALIAS filter)
    thumb.thumbnail((size, size), Image.LANCZOS)
    article_id = "nla.news-article{}".format(article["id"])
    fnt = ImageFont.truetype(font_path, font_size)
    draw = ImageDraw.Draw(thumb)
    # thumbnail() only ever shrinks, so position the label using the real
    # dimensions of the thumb rather than the requested size
    width, height = thumb.size
    label_top = height - font_size
    # Colour names work for both RGB and grayscale images
    draw.rectangle([(0, label_top), (width, height)], fill="white")
    draw.text((0, label_top), article_id, font=fnt, fill="black")
    return thumb


def get_total_results(params):
    """
    Ask the API how many records match a search.

    The request is made with a copy of params in which 'n' is set to 0; the
    params passed in are left untouched. Returns the total as an integer.
    """
    count_params = {**params, "n": 0}
    data = s.get(
        "https://api.trove.nla.gov.au/v2/result", params=count_params, timeout=60
    ).json()
    records = data["response"]["zone"][0]["records"]
    return int(records["total"])


def get_thumbnails(query, api_key, size, font_path, font_size):
    """
    Harvest the search results for a query and save a thumbnail of each article.

    Works through the results a page at a time. Each thumbnail is saved in the
    'thumbs' directory as '<date>-nla.news-article<id>.jpg'. Articles whose
    thumbnail file already exists are skipped, so an interrupted harvest can be
    restarted without repeating downloads.

    Parameters:
        query: the search query.
        api_key: your Trove API key.
        size: size of the thumbnails in pixels.
        font_path: path to the font file used to label the thumbnails.
        font_size: size of the label text.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    params = {
        "q": query,
        "zone": "newspaper",
        "encoding": "json",
        "bulkHarvest": "true",
        "n": 100,
        "key": api_key,
        "reclevel": "full",
    }
    start = "*"
    total = get_total_results(params)
    with tqdm(total=total) as pbar:
        while start:
            params["s"] = start
            response = s.get(
                "https://api.trove.nla.gov.au/v2/result", params=params, timeout=60
            )
            # Fail with a clear HTTP error rather than an obscure parsing error
            response.raise_for_status()
            records = response.json()["response"]["zone"][0]["records"]
            # The nextStart parameter is used to get the next page of results.
            # If there's no nextStart then it means we're on the last page of results.
            start = records.get("nextStart")
            # A page without any results has no 'article' key at all
            for article in records.get("article", []):
                thumb_file = "thumbs/{}-nla.news-article{}.jpg".format(
                    article["date"], article["id"]
                )
                if not os.path.exists(thumb_file):
                    thumb = get_thumbnail(article, size, font_path, font_size)
                    if thumb is not None:
                        thumb.save(thumb_file)
                pbar.update(1)


def create_composite(cols, rows, size, sort=False):
    """
    Paste the saved thumbnails into one big image and save it as a JPEG.

    The '.jpg' files in the 'thumbs' directory are laid out left to right, top
    to bottom, in a grid of cols x rows cells. Thumbnails that aren't exactly
    size x size pixels are left out, and the cell they would have used is
    filled by the next thumbnail. The result is saved as
    'composite-<cols>-<rows>.jpg'.

    Parameters:
        cols: number of thumbnails across; the image is cols x size pixels wide.
        rows: number of thumbnails down; the image is rows x size pixels high.
        size: width and height of each thumbnail in pixels.
        sort: if True, place thumbnails in file name order (file names start
            with the article date). Otherwise the order is whatever
            os.listdir() returns.
    """
    im = Image.new("RGB", (cols * size, rows * size))
    thumbs = [t for t in os.listdir("thumbs") if t.endswith(".jpg")]
    if sort:
        thumbs.sort()
    capacity = cols * rows
    # Count only the thumbnails actually pasted, so a skipped file can't push
    # the row wrapping out of step with the column position
    placed = 0
    for thumb_file in tqdm(thumbs):
        if placed >= capacity:
            # The grid is full; anything else would land outside the image
            break
        # Open inside a with block so each file is closed once it's pasted
        with Image.open(os.path.join("thumbs", thumb_file)) as thumb:
            if thumb.size != (size, size):
                continue
            row, col = divmod(placed, cols)
            im.paste(thumb, (col * size, row * size))
        placed += 1
    im.save("composite-{}-{}.jpg".format(cols, rows), quality=90)

Create all the thumbnails¶

In [ ]:

get_thumbnails(query, api_key, size, font_path, font_size)

Turn the thumbnails into one big image¶

In [ ]:

create_composite(cols, rows, size)

Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.