Create large composite images from snipped words

This is a variation of the 'scissors & paste' notebook that extracts words from Trove newspaper images and compiles them into messages. In this notebook, you can harvest multiple versions of a list of words and compile them all into one big image.

Slice of composite image

View high-res version

In [ ]:
# Your Trove API key (replace the placeholder before harvesting)
API_KEY = 'YOUR TROVE API KEY'

# The words to look for in Trove newspaper articles
WORD_LIST = [
    'newspaper',
    'book',
    'magazine',
    'journal',
    'picture',
    'data',
    'music',
    'map',
    'people',
    'discover',
    'explore',
    'web',
]

# Upper limit on the number of images harvested per word. A word can't always
# be located in an article, so the number actually saved is usually a bit lower.
NUM_WORDS = 50

# Directory the cropped word images are written to
IMG_DIR = 'words'
In [ ]:
# Import what we need
import requests
from bs4 import BeautifulSoup
from PIL import Image, ImageOps
from io import BytesIO
import base64
import time
from pathlib import Path
from rectpack import *
from datetime import datetime
from IPython.display import display, FileLink

# Create the output directory (and any missing parents); do nothing if it already exists
Path(IMG_DIR).mkdir(parents=True, exist_ok=True)
In [ ]:
def get_word_boxes(article_url):
    '''
    Find the bounding boxes of the search terms highlighted on an article page.

    Downloads the page at `article_url`, reads the newspaper page id from the
    first `div.zone.onPage` element, and returns one dict per
    `span.highlightedTerm` element with the keys `page_id`, `left`, `top`,
    `width` and `height` (coordinates converted to int).

    Raises IndexError if the page has no `div.zone.onPage` element, and
    KeyError if an expected `data-*` attribute is missing.
    '''
    html = requests.get(article_url).text
    soup = BeautifulSoup(html, 'lxml')
    # The page id is shared by every highlighted term in the article
    page_zone = soup.select('div.zone.onPage')[0]
    page_id = page_zone['data-page-id']
    return [
        {
            'page_id': page_id,
            'left': int(term['data-x']),
            'top': int(term['data-y']),
            'width': int(term['data-w']),
            'height': int(term['data-h'])
        }
        for term in soup.select('span.highlightedTerm')
    ]
    
def crop_word(box, kw, article_id):
    '''
    Crop a highlighted word out of its full page image and save it.

    The crop is written to `{IMG_DIR}/{kw}-{article_id}.jpg`. If that file
    already exists, nothing is downloaded and the function returns immediately.

    Parameters:
        box: dict with `page_id`, `left`, `top`, `width` and `height` keys,
            as produced by get_word_boxes().
        kw: the search keyword, used in the output filename.
        article_id: the article identifier, used in the output filename.
    '''
    word_path = Path(f'{IMG_DIR}/{kw}-{article_id}.jpg')
    if word_path.exists():
        return
    # Construct the url we need to download the page image
    page_url = 'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(box['page_id'], 7)
    # Download the page image
    response = requests.get(page_url)
    # The context manager closes the page image even if cropping or saving fails
    with Image.open(BytesIO(response.content)) as img:
        # Add a 5 pixel margin on every side of the word's box
        word = img.crop((box['left'] - 5, box['top'] - 5, box['left'] + box['width'] + 5, box['top'] + box['height'] + 5))
        # Save while the source image is still open
        word.save(word_path)
    
def get_article_from_search(kw):
    '''
    Use the Trove API to find articles with the supplied keyword, then save
    an image of the first highlighted occurrence of the keyword in each one.

    Up to NUM_WORDS articles are requested. Articles whose page can't be
    parsed for word boxes, or that contain no highlighted terms, are skipped.
    There is a one second pause after each article.

    Parameters:
        kw: the word to search for (searched as an exact phrase in the text).
    '''
    params = {
        'q': f'text:"{kw}"',
        'zone': 'newspaper',
        'encoding': 'json',
        'n': NUM_WORDS,
        'key': API_KEY
    }
    response = requests.get('https://api.trove.nla.gov.au/v2/result', params=params)
    data = response.json()
    # Treat a missing 'article' list as "no results" rather than crashing the
    # whole harvest (presumably the key is absent when nothing matches -- verify
    # against the API)
    articles = data['response']['zone'][0]['records'].get('article', [])
    for article in articles:
        try:
            boxes = get_word_boxes(article['troveUrl'])
        except (KeyError, IndexError):
            # KeyError: an expected key/attribute is missing.
            # IndexError: the article page has no 'div.zone.onPage' element.
            boxes = []
        if boxes:
            # Only the first highlighted occurrence is saved for each article
            crop_word(boxes[0], kw, article['id'])
        time.sleep(1)

Get all the words

In [ ]:
# Search for each word in turn and save cropped images of it to IMG_DIR
for word in WORD_LIST:
    get_article_from_search(word)

Create the composite image

Here we use a packing algorithm to try to fit the little word images (which come in a variety of shapes and sizes) into one big box with as few gaps as possible. Adjust the WIDTH and HEIGHT values below to change the size of the composite.

In [ ]:
# Width of the composite image in pixels (also the width of the packing bin)
WIDTH = 3000

# Height of the composite image in pixels (also the height of the packing bin)
HEIGHT = 3000

# Background colour of the composite image as an RGB tuple (also used for the border around each word)
BG_COLOUR = (180, 180, 180)
In [ ]:
def get_image_data():
    '''
    Collect the padded dimensions of every harvested word image.

    Returns a list of `(width + 2, height + 2, filename)` tuples, one for each
    `.jpg` file in IMG_DIR. The extra 2 pixels in each dimension leave room
    for the 1 pixel border added around each word in create_composite().
    '''
    images = []
    for img_path in Path(IMG_DIR).glob('*.jpg'):
        # Only the size is needed, so close the file as soon as it's been read
        with Image.open(img_path) as img:
            # Pillow reports size as (width, height)
            width, height = img.size
        images.append((width + 2, height + 2, img_path.name))
    return images

def pack_images():
    '''
    Pack the harvested word images into a single WIDTH x HEIGHT bin.

    Returns a tuple of (number of images available, the packer's list of
    placed rectangles). Images are added in the order get_image_data()
    returns them, without sorting or rotation.
    '''
    candidates = get_image_data()
    packer = newPacker(sort_algo=SORT_NONE, rotation=False)
    for rect_width, rect_height, filename in candidates:
        # The filename is the rectangle id, so the image can be found again later
        packer.add_rect(rect_width, rect_height, filename)
    packer.add_bin(WIDTH, HEIGHT)
    packer.pack()
    placed = packer.rect_list()
    return len(candidates), placed

def create_composite(output_file=None):
    '''
    Paste the packed word images onto one big image and save it.

    Prints how many of the harvested images were placed and displays a link
    to the saved file.

    Parameters:
        output_file: path to save the composite to. If not supplied, a
            timestamped `trove-words-<timestamp>.jpg` filename is used.
    '''
    num_images, rectangles = pack_images()
    comp = Image.new('RGB', (WIDTH, HEIGHT), BG_COLOUR)
    for _bin, x, y, w, h, rid in rectangles:
        # The rectangle id is the image's filename within IMG_DIR
        word_path = Path(IMG_DIR, rid)
        # Convert inside the context manager so the source file is released
        # as soon as the RGB copy has been made
        with Image.open(word_path) as source:
            word = source.convert('RGB')
        # The 1 pixel border fills the 2 extra pixels allowed for when packing
        word_with_border = ImageOps.expand(word, border=1, fill=BG_COLOUR)
        comp.paste(word_with_border, (x, y, x+w, y+h))
    if not output_file:
        output_file = f'trove-words-{int(datetime.now().timestamp())}.jpg'
    comp.save(output_file)
    print(f'{len(rectangles)} of {num_images} images used')
    display(FileLink(output_file))

Run the cell below to create a composite image of the words you've harvested. The function will tell you how many of the harvested words it was able to fit into the composite. You can adjust the width and height of the composite to fit in more words or to fill up gaps.

In [ ]:
# Build the composite with a timestamped filename and display a link to the saved file
create_composite()

Created by Tim Sherratt for the GLAM Workbench.