Get the page coordinates of a digitised newspaper article from Trove

In [ ]:
import requests
from IPython.display import display
from IPython.display import Image as DisplayImage
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw
from io import BytesIO
import re
In [ ]:
def get_box(zones):
    '''
    Find the bounding box that encloses every zone in a list.

    Each zone must support item access for 'data-page-id', 'data-x', 'data-y',
    'data-w', and 'data-h', and the four positional values must be convertible
    with int(). The page id is taken from the first zone only.

    Returns a dict with the keys 'page_id', 'left', 'top', 'right', and 'bottom'.
    Raises IndexError if zones is empty.
    '''
    page_id = zones[0]['data-page-id']
    # Convert each zone's attributes once, as (x, y, width, height)
    rects = [
        (int(zone['data-x']), int(zone['data-y']), int(zone['data-w']), int(zone['data-h']))
        for zone in zones
    ]
    # Take the extremes from the data itself rather than from fixed starting
    # values, so coordinates of any size produce a correct box
    return {
        'page_id': page_id,
        'left': min(x for x, _, _, _ in rects),
        'top': min(y for _, y, _, _ in rects),
        'right': max(x + w for x, _, w, _ in rects),
        'bottom': max(y + h for _, y, _, h in rects),
    }
    
def get_article_boxes(article_url):
    '''
    Positional information about the article is attached to each block of the OCR output in data attributes.
    This function loads the HTML version of the article and scrapes the x, y, width, and height values
    for each block of text to determine the coordinates of a box around the article.

    Returns a list of bounding boxes (dicts as produced by get_box): first one for the 'onPage' zones,
    if there are any, then one for each consecutive run of 'offPage' zones that share a 'data-page-id'.
    Raises requests.HTTPError if the article page responds with an error status.
    '''
    # Local import: itertools is not among the imports at the top of the notebook
    from itertools import groupby

    # Get the article page
    response = requests.get(article_url)
    # Fail with a clear HTTP error rather than scraping an error page
    response.raise_for_status()
    # Load in BS4
    soup = BeautifulSoup(response.text, 'lxml')
    boxes = []
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    on_page_zones = soup.select('div.zone.onPage')
    # get_box needs at least one zone, so skip an empty selection
    if on_page_zones:
        boxes.append(get_box(on_page_zones))
    # 'offPage' zones presumably belong to the other pages the article runs over;
    # they are grouped by page id in document order, one box per consecutive run
    off_page_zones = soup.select('div.zone.offPage')
    for _, page_zones in groupby(off_page_zones, key=lambda zone: zone['data-page-id']):
        boxes.append(get_box(list(page_zones)))
    return boxes

def display_boxes(boxes):
    '''
    Show each box as a green rectangle drawn on its newspaper page image.

    For every box, the page image identified by box['page_id'] is downloaded from Trove,
    the rectangle given by 'left', 'top', 'right', and 'bottom' is drawn on it,
    and the result is displayed as a JPEG with a display width of 400.
    '''
    url_template = 'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'
    for box in boxes:
        # Fill in the page id and the image level to get the download url
        image_url = url_template.format(box['page_id'], 7)
        downloaded = requests.get(image_url)
        # Load the downloaded bytes as an RGB image that can be drawn on
        page_image = Image.open(BytesIO(downloaded.content)).convert(mode='RGB')
        canvas = ImageDraw.Draw(page_image)
        # Corners of the rectangle to draw
        top_left = (box['left'], box['top'])
        bottom_right = (box['right'], box['bottom'])
        canvas.rectangle([top_left, bottom_right], outline=(0, 255, 0), width=20)
        # Encode the annotated image as a JPEG in memory and display it
        jpeg_bytes = BytesIO()
        page_image.save(jpeg_bytes, format='JPEG')
        display(DisplayImage(data=jpeg_bytes.getvalue(), width=400))
In [ ]:
# Get the bounding boxes for a sample article, then print the resulting list
boxes = get_article_boxes('https://trove.nla.gov.au/newspaper/article/258166628?searchTerm=wragge')
print(boxes)
In [ ]:
# Download each page image and display it with the article's bounding box drawn on it
display_boxes(boxes)

What can I do with this?

In the GLAM Workbench there's a notebook (and app) to save an article as an image using the code above. But what about building something like this into a pipeline to assemble a dataset of images? Perhaps illustrated advertisements by decade, or by product type, or from the Australian Women's Weekly? A collection of weather maps?


Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.