Notebook

Create large composite images from snipped words¶

This is a variation of the 'scissors & paste' notebook that extracts words from Trove newspaper images and compiles them into messages. In this notebook, you can harvest multiple versions of a list of words and compile them all into one big image.

Slice of composite image

View high-res version

In [17]:

# Import what we need
import os
import random
import time
from datetime import datetime
from io import BytesIO
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from IPython.display import FileLink, display
from PIL import Image, ImageOps
from rectpack import SORT_NONE, newPacker

In [ ]:

%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv

In [ ]:

# Insert your Trove API key
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")

# List of words you want to harvest
WORD_LIST = [
    "newspaper",
    "book",
    "magazine",
    "journal",
    "picture",
    "data",
    "music",
    "map",
    "people",
    "discover",
    "explore",
    "web",
    "research",
    "create",
    "article",
    "history",
]

# Max number of images of each word you want to harvest (sometimes the words can't be found in the article, so the actual number will probably be a little less)
NUM_WORDS = 50

# Where to save the images
IMG_DIR = "words"

# Create the output directory
Path(IMG_DIR).mkdir(parents=True, exist_ok=True)

In [ ]:

def get_word_boxes(article_url):
    """
    Get the boxes around highlighted search terms.
    """
    boxes = []
    # Get the article page
    response = requests.get(article_url)
    # Load in BS4
    soup = BeautifulSoup(response.text, "lxml")
    # Get the id of the newspaper page
    page_id = soup.select("div.zone.onPage")[0]["data-page-id"]
    # Find the highlighted terms
    words = soup.select("span.highlightedTerm")
    # Save the box coords
    for word in words:
        box = {
            "page_id": page_id,
            "left": int(word["data-x"]),
            "top": int(word["data-y"]),
            "width": int(word["data-w"]),
            "height": int(word["data-h"]),
        }
        boxes.append(box)
    return boxes


def crop_word(box, kw, article_id):
    """
    Crop the box coordinates from the full page image.
    """
    word_path = Path(f"{IMG_DIR}/{kw}-{article_id}.jpg")
    if not word_path.exists():
        # Construct the url we need to download the page image
        page_url = (
            "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
                box["page_id"], 7
            )
        )
        # print(page_url)
        # Download the page image
        response = requests.get(page_url)
        # Open download as an image for editing
        img = Image.open(BytesIO(response.content))
        word = img.crop(
            (
                box["left"] - 5,
                box["top"] - 5,
                box["left"] + box["width"] + 5,
                box["top"] + box["height"] + 5,
            )
        )
        img.close()
        word.save(word_path)


def get_article_from_search(kw):
    """
    Use the Trove API to find articles with the supplied keyword.
    """
    params = {
        "q": f'text:"{kw}"',
        "zone": "newspaper",
        "encoding": "json",
        "n": NUM_WORDS,
        "key": API_KEY,
    }
    response = requests.get("https://api.trove.nla.gov.au/v2/result", params=params)
    data = response.json()
    articles = data["response"]["zone"][0]["records"]["article"]
    for article in articles:
        boxes = []
        try:
            boxes = get_word_boxes(article["troveUrl"])
        except KeyError:
            pass
        if boxes:
            crop_word(boxes[0], kw, article["id"])
        time.sleep(1)

Get all the words¶

In [ ]:

for word in WORD_LIST:
    get_article_from_search(word)

Create the composite image¶

Here we use a packing algorithm to try and fit the little word images (which are a variety of shapes and sizes) into one big box with as few gaps as possible. Adjust the WIDTH and HEIGHT values below to change the size of the composite.

In [ ]:

# Set width of composite image
WIDTH = 2000

# Set height of composite image
HEIGHT = 1000

# Set background colour of composite image
BG_COLOUR = (0, 0, 0)

In [ ]:

def get_image_data():
    images = []
    for im in [i for i in Path(IMG_DIR).glob("*.jpg")]:
        img = Image.open(im)
        h, w = img.size
        images.append((h + 4, w + 4, im.name))
    return images


def pack_images():
    images = get_image_data()
    random.shuffle(images)
    packer = newPacker(sort_algo=SORT_NONE, rotation=False)
    for i in images:
        packer.add_rect(*i)
    packer.add_bin(WIDTH, HEIGHT)
    packer.pack()
    return len(images), packer.rect_list()


def create_composite(output_file=None):
    num_images, rectangles = pack_images()
    comp = Image.new("RGB", (WIDTH, HEIGHT), BG_COLOUR)
    for rect in rectangles:
        b, x, y, w, h, rid = rect
        # print(x,y, w, h, rid)
        word_path = Path(IMG_DIR, rid)
        word = Image.open(word_path)
        word = word.convert("RGB")
        word_with_border = ImageOps.expand(word, border=2, fill=BG_COLOUR)
        comp.paste(word_with_border, (x, y, x + w, y + h))
    if not output_file:
        output_file = (
            f"trove-words-{int(datetime.now().timestamp())}-{WIDTH}-{HEIGHT}.jpg"
        )
    comp.save(output_file)
    print(f"{len(rectangles)} of {num_images} images used")
    display(FileLink(output_file))

Run the cell below to create a composite image of the words you've harvested. The function will tell you how many of the harvested words it was able to fit into the composite. You can adjust the width and height of the composite to fit in more, or fill up gaps.

In [ ]:

create_composite()

Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.