# Generate a thumbnail image from a Trove newspaper article

In another notebook, I showed how to get high-resolution page images from newspapers. But what if you only want a nice square thumbnail for display purposes? This notebook gets the page image and then crops and resizes the top of the article to create a thumbnail.

Of course, if you're doing this to lots of articles you won't want to feed each one in manually. If you're viewing this notebook in app mode (no code visible), just click on the 'Edit app' button to see what's going on behind the scenes. You should be able to copy and modify the code to suit your purposes.

Briefly, the steps to generate a thumbnail are:

• Scrape the article's HTML page to get the page identifier and the coordinates of the article on the page
• Crop a square image from the page using the coordinates
• Resize the cropped image
In [1]:
import ipywidgets as widgets
import requests
import random
import re
from IPython.display import display, HTML, FileLink, clear_output
from bs4 import BeautifulSoup
from PIL import Image, ImageOps
from io import BytesIO
import base64

# NOTE(review): `titles` is not referenced anywhere else in the visible notebook code.
titles = {}

# Output widget that get_article_thumbnail() clears and writes into; it is displayed in the final cell.
results = widgets.Output()

def display_button():
    '''
    Create the 'Get thumbnail' button, hook it up to get_article_thumbnail(), and display it.
    '''
    button_options = {
        'description': 'Get thumbnail',
        'disabled': False,
        'button_style': 'primary',
        'icon': '',
    }
    thumbnail_button = widgets.Button(**button_options)
    # The handler is called with the clicked button as its only argument
    thumbnail_button.on_click(get_article_thumbnail)
    display(thumbnail_button)

def get_box(zones):
    '''
    Find a (at most square) bounding box around the top of an article.

    Parameters:
        zones: a non-empty sequence of mapping-like objects (e.g. BeautifulSoup tags)
            exposing the keys 'data-page-id', 'data-x', 'data-y', 'data-w' and 'data-h'.
            Coordinate values must be convertible with int().

    Returns:
        A dict with the keys 'page_id', 'left', 'top', 'right' and 'bottom'.
        'page_id' is taken from the first zone.

    Raises:
        ValueError: if `zones` is empty.

    Only zones whose left edge is within 200 units of the left-most zone are used
    to work out the top, right and bottom limits. The box is then trimmed from the
    bottom so that it is never taller than it is wide.
    '''
    if not zones:
        raise ValueError('No zones found, so the position of the article cannot be determined.')
    page_id = zones[0]['data-page-id']
    # Left-most edge across every zone (no assumed upper limit on coordinate values)
    left = min(int(zone['data-x']) for zone in zones)
    # Keep only the zones that start in (roughly) the left-most column
    column_tolerance = 200
    column = [zone for zone in zones if int(zone['data-x']) < left + column_tolerance]
    # Parse the coordinates of the retained zones once: (x, y, width, height)
    rects = [
        (int(zone['data-x']), int(zone['data-y']), int(zone['data-w']), int(zone['data-h']))
        for zone in column
    ]
    top = min(y for _, y, _, _ in rects)
    right = max(x + w for x, _, w, _ in rects)
    bottom = max(y + h for _, y, _, h in rects)
    # For a square image: don't let the box be taller than it is wide
    bottom = min(bottom, top + (right - left))
    return {'page_id': page_id, 'left': left, 'top': top, 'right': right, 'bottom': bottom}

def get_illustration(zone):
    '''
    Build a bounding box dict from a single zone.

    The zone's 'data-x', 'data-y', 'data-w' and 'data-h' values are converted to
    integers; the right and bottom edges are the origin plus the width and height.
    Returns a dict with the keys 'page_id', 'left', 'top', 'right' and 'bottom'.
    '''
    page_id = zone['data-page-id']
    x = int(zone['data-x'])
    width = int(zone['data-w'])
    y = int(zone['data-y'])
    height = int(zone['data-h'])
    return {
        'page_id': page_id,
        'left': x,
        'top': y,
        'right': x + width,
        'bottom': y + height,
    }

def get_article_box(article_url, illustrated=False):
    '''
    Scrape an article's HTML page to work out where the article sits on the page image.

    Positional information is attached to the OCR output in data attributes. This function
    loads the HTML version of the article and uses those attributes to build a bounding box.

    Parameters:
        article_url: url of the HTML version of the article.
        illustrated: if True and the page contains at least one illustration, the box
            is taken from the parent element of the first illustration (via get_illustration())
            rather than from the text zones (via get_box()).

    Returns:
        A dict with the keys 'page_id', 'left', 'top', 'right' and 'bottom'.

    Raises:
        requests.HTTPError: if the article page responds with an error status.
        requests.Timeout: if the server doesn't respond in time.
    '''
    # Without a timeout a stalled connection would hang the notebook indefinitely
    response = requests.get(article_url, timeout=30)
    # Fail here with a clear error rather than trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    # 'onPage' limits the selection to elements on the current page
    illustrations = soup.select('div.illustration.onPage')
    if illustrations and illustrated is True:
        # The data attributes are read from the illustration's parent element
        zone = illustrations[0].parent
        box = get_illustration(zone)
    else:
        # Lines of OCR are in divs with the class 'zone'
        zones = soup.select('div.zone.onPage')
        box = get_box(zones)
    return box

def get_article_thumbnail(b):
    '''
    Extract a square thumbnail of the article from the page image and display it.

    This is the click handler for the 'Get thumbnail' button; `b` is the clicked
    button and is not used. The article url, thumbnail size and illustration option
    are read from the `article_url`, `size` and `illustrated` widgets, and the
    thumbnail (with a download link) is written to the `results` output widget.
    '''
    results.clear_output(wait=True)
    match = re.search(r'article\/{0,1}(\d+)', article_url.value)
    if match is None:
        # Nothing that looks like an article id -- tell the user instead of raising in the callback
        with results:
            print('Could not find an article id in that url.')
        return
    article_id = match.group(1)
    thumb_size = size.value
    # Get position of article on the page(s)
    box = get_article_box('http://nla.gov.au/nla.news-article{}'.format(article_id), illustrated=illustrated.value)
    # NOTE(review): presumably the scraped coordinates relate to the level 7 page image -- confirm
    page_url = 'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(box['page_id'], 7)
    response = requests.get(page_url, timeout=60)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    # Crop image to article box
    points = (box['left'], box['top'], box['right'], box['bottom'])
    # Convert to RGB so the white padding colour and the JPEG save work whatever the source mode
    thumb = img.crop(points).convert('RGB')
    # Resize in place, preserving the aspect ratio (Image.ANTIALIAS was removed in Pillow 10)
    thumb.thumbnail((thumb_size, thumb_size), Image.LANCZOS)
    new_w, new_h = thumb.size
    # Squarify: pad the shorter dimension equally on both sides with white
    delta_w = thumb_size - new_w
    delta_h = thumb_size - new_h
    padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
    thumb = ImageOps.expand(thumb, padding, fill=(255, 255, 255))
    # Create a filename for the thumbnail
    thumb_file = 'nla.news-article{}-{}.jpg'.format(article_id, thumb_size)
    # To avoid problems with saving & using local files, save the image into a file object
    image_file = BytesIO()
    thumb.save(image_file, 'JPEG')
    image_bytes = image_file.getvalue()
    # For the download link we can use a data uri -- a base64 encoded version of the file
    encoded_image = base64.b64encode(image_bytes).decode('ascii')
    encoded_string = 'data:image/jpeg;base64,' + encoded_image
    with results:
        # Display a link to download the image
        display(HTML('<a download="{}" href="{}">Download thumbnail</a>'.format(thumb_file, encoded_string)))
        # Display the image
        display(widgets.Image(
            value=image_bytes,
            format='jpg'
        ))


## Enter an article url...

You can use the url in your browser's location bar or an article permalink.

In [2]:
# Text box for the article url; get_article_thumbnail() extracts the article id from its value
article_url = widgets.Text(
    description='Article/Page:',
    placeholder='Enter an article url',
    disabled=False,
)
display(article_url)


## Optional settings

Generate a square thumbnail with this height and width (in pixels).

In [3]:
# Side length (in pixels) of the square thumbnail, limited to 100-800 in steps of 50
size = widgets.BoundedIntText(
    description='Size:',
    value=500,
    min=100,
    max=800,
    step=50,
    disabled=False,
)
display(size)


If there's an illustration in the article, check this box to use it as the thumbnail. The illustration will not be cropped, so whitespace will be added around the image to make it square.

In [4]:
# When ticked, get_article_thumbnail() asks get_article_box() to use an illustration if there is one
illustrated = widgets.Checkbox(
    description='Use illustration as thumbnail',
    value=False,
    disabled=False,
)

display(illustrated)


## Get the thumbnail!

In [5]:
# Show the 'Get thumbnail' button, followed by the output area that its click handler writes into
display_button()
display(results)


Created by Tim Sherratt for the GLAM Workbench.