Generate a thumbnail image from a Trove newspaper article

In another notebook, I showed how to get high-resolution page images from newspapers. But what if you only want a nice square thumbnail for display purposes? This notebook gets the page image and then crops and resizes the top of the article to create a thumbnail.

Of course, if you're doing this for lots of articles, you won't want to feed each one in manually. If you're viewing this notebook in app mode (no code visible), just click on the 'Edit app' button to see what's going on behind the scenes. You should be able to copy and modify the code to suit your purposes.

Briefly, the steps to generate a thumbnail are:

  • Scrape the article's HTML page to get the page identifier and the coordinates of the article on the page
  • Use the page identifier to download a high-res page image
  • Crop a square image from the page using the coordinates
  • Resize the cropped image
In [15]:
import ipywidgets as widgets
import requests
import random
import re
from IPython.display import display, HTML, FileLink, clear_output
from bs4 import BeautifulSoup
from PIL import Image, ImageOps
from io import BytesIO
import base64

# Not referenced by any of the code visible in this notebook
titles = {}

# Output area that get_article_thumbnail clears and then writes the download link and thumbnail into
results = widgets.Output()

def display_button():
    '''
    Create the 'Get thumbnail' button, connect it to get_article_thumbnail, and display it.
    '''
    # Appearance and state of the button
    button_settings = {
        'description': 'Get thumbnail',
        'disabled': False,
        'button_style': 'primary',
        'tooltip': 'Click to download',
        'icon': '',
    }
    thumbnail_button = widgets.Button(**button_settings)
    # Clicking the button kicks off the thumbnail generation
    thumbnail_button.on_click(get_article_thumbnail)
    display(thumbnail_button)
    
def get_box(zones, column_tolerance=200):
    '''
    Find a bounding box around the top of the article's left-most column.

    Only zones whose left edge is within `column_tolerance` of the left-most zone
    are used to work out the top, right, and bottom limits. The bottom edge is then
    pulled up, if necessary, so that the box is no taller than it is wide.

    Parameters:
        zones: a non-empty sequence of elements that support item access to the
            'data-page-id', 'data-x', 'data-y', 'data-w', and 'data-h' attributes
            (the values must be convertible with int()).
        column_tolerance: how far (in the same units as 'data-x') a zone's left edge
            can be from the left-most zone and still count as part of the first column.
            Must be positive, otherwise no zones are selected.

    Returns:
        A dict with the keys 'page_id', 'left', 'top', 'right', and 'bottom'.

    Raises:
        IndexError: if `zones` is empty.
    '''
    page_id = zones[0]['data-page-id']
    # Use the real minimum rather than a fixed starting value (previously 10000),
    # so that coordinates of any size produce a valid box.
    left = min(int(zone['data-x']) for zone in zones)
    # Only zones that start close to the left edge belong to the first column
    first_column = [zone for zone in zones if int(zone['data-x']) < left + column_tolerance]
    top = min(int(zone['data-y']) for zone in first_column)
    right = max(int(zone['data-x']) + int(zone['data-w']) for zone in first_column)
    bottom = max(int(zone['data-y']) + int(zone['data-h']) for zone in first_column)
    # For a square image -- don't let the box be taller than it is wide
    bottom = min(bottom, top + (right - left))
    return {'page_id': page_id, 'left': left, 'top': top, 'right': right, 'bottom': bottom}


def get_illustration(zone):
    '''
    Turn the position attributes of a single zone into a bounding box.
    Returns a dict with the page id and the left, top, right, and bottom edges.
    '''
    page_id = zone['data-page-id']
    x = int(zone['data-x'])
    width = int(zone['data-w'])
    y = int(zone['data-y'])
    height = int(zone['data-h'])
    # Right and bottom edges are the origin plus the width and height
    return {
        'page_id': page_id,
        'left': x,
        'top': y,
        'right': x + width,
        'bottom': y + height
    }


def get_article_box(article_url, illustrated=False):
    '''
    Positional information about the article is attached to each line of the OCR output in data attributes.
    This function loads the HTML version of the article and scrapes the x, y, and width values for each line of text
    to determine the coordinates of a box around the article.

    Parameters:
        article_url: url of the HTML version of the article.
        illustrated: if True and the article has an illustration on the current page,
            return the box of the first illustration instead of the text.

    Returns:
        A dict with the keys 'page_id', 'left', 'top', 'right', and 'bottom'.

    Raises:
        requests.exceptions.RequestException: if the request times out or returns an HTTP error.
        IndexError: if no text zones are found on the page (raised by get_box).
    '''
    # A timeout stops the notebook hanging indefinitely if the server doesn't respond
    response = requests.get(article_url, timeout=60)
    # Stop here on an HTTP error rather than trying to scrape an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    # 'onPage' limits to those on the current page
    illustrations = soup.select('div.illustration.onPage')
    if illustrations and illustrated is True:
        # The position attributes are read from the illustration's parent element
        return get_illustration(illustrations[0].parent)
    # Lines of OCR are in divs with the class 'zone'
    zones = soup.select('div.zone.onPage')
    return get_box(zones)


def get_article_thumbnail(b):
    '''
    Extract a square thumbnail of the article from the page image.

    This is the click handler for the 'Get thumbnail' button. It reads its settings from the
    article_url, size, and illustrated widgets, and writes a download link and the thumbnail
    itself into the results output widget.

    Parameters:
        b: the button that was clicked (not used).
    '''
    results.clear_output(wait=True)
    id_match = re.search(r'article\/{0,1}(\d+)', article_url.value)
    if id_match is None:
        # Tell the user what's wrong instead of failing with an AttributeError
        with results:
            print('Could not find an article id in that url.')
        return
    article_id = id_match.group(1)
    # Read the size once so every step uses the same value
    thumb_size = size.value
    # Get position of article on the page(s)
    box = get_article_box('http://nla.gov.au/nla.news-article{}'.format(article_id), illustrated=illustrated.value)
    print(box)
    # Construct the url we need to download the page image
    page_url = 'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(box['page_id'], 7)
    # Download the page image -- the timeout stops the notebook hanging if the server doesn't respond
    response = requests.get(page_url, timeout=60)
    # Stop here on an HTTP error rather than trying to open an error page as an image
    response.raise_for_status()
    # Open download as an image for editing
    img = Image.open(BytesIO(response.content))
    # Crop image to article box
    points = (box['left'], box['top'], box['right'], box['bottom'])
    thumb = img.crop(points)
    # Resize (Image.ANTIALIAS was removed in Pillow 10 -- LANCZOS is the same filter)
    thumb.thumbnail((thumb_size, thumb_size), Image.LANCZOS)
    new_w, new_h = thumb.size
    # Squarify -- pad the shorter dimension with white, split evenly between both sides
    delta_w = thumb_size - new_w
    delta_h = thumb_size - new_h
    padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
    thumb = ImageOps.expand(thumb, padding, fill='white')
    # Create a filename for the thumbnail
    thumb_file = 'nla.news-article{}-{}.jpg'.format(article_id, thumb_size)
    # To avoid problems with saving & using local files, we save the image into an in-memory file object
    image_file = BytesIO()
    thumb.save(image_file, 'JPEG')
    image_bytes = image_file.getvalue()
    # For the download link we can use a data uri -- a base64 encoded version of the file
    encoded_image = base64.b64encode(image_bytes).decode()
    # The media type has to match the JPEG data saved above
    encoded_string = 'data:image/jpeg;base64,' + encoded_image
    with results:
        # Create a download link using the data uri
        display(HTML('<a download="{0}" href="{1}">Download {0}</a>'.format(thumb_file, encoded_string)))
        # Display the image
        display(widgets.Image(
            value=image_bytes,
            format='jpg'
        ))

Enter an article url...

You can use either the URL from your browser's location bar or an article permalink.

In [2]:
# Text box for the article url -- get_article_thumbnail reads its value when the button is clicked
article_url = widgets.Text(
    description='Article/Page:',
    placeholder='Enter an article url',
    disabled=False
)
display(article_url)

Optional settings

Generate a square thumbnail with this height and width (in pixels).

In [3]:
# Width and height of the thumbnail in pixels -- get_article_thumbnail reads its value.
# Limited to between 100 and 800, starting at 500 and changing in steps of 50.
size = widgets.BoundedIntText(
    min=100,
    max=800,
    value=500,
    step=50,
    description='Size:',
    disabled=False
)
display(size)

If there's an illustration in the article, check this box to use it as the thumbnail. The illustration will not be cropped, so whitespace will be added around the image to make it square.

In [14]:
# Checkbox (unchecked by default) -- its value is passed to get_article_box as `illustrated`
illustrated = widgets.Checkbox(
    description='Use illustration as thumbnail',
    value=False,
    disabled=False
)
display(illustrated)

Get the thumbnail!

In [16]:
# Show the 'Get thumbnail' button
display_button()
# Show the output area that get_article_thumbnail writes into, below the button
display(results)
In [ ]: