#!/usr/bin/env python # coding: utf-8 # # Save a Trove newspaper article as an image # # Sometimes you want to be able to save a Trove newspaper article as an image. Unfortunately, the Trove web interface doesn't make this easy. The 'Download JPG' option actually loads an HTML page, and while you could individually save the images embedded in the HTML page, often articles are sliced up in ways that make the whole thing hard to read and use. # # One alternative is to [download the complete page](Save-page-image.ipynb) on which an article is published. I've also created a notebook that [generates a nice-looking thumbnail](Get-article-thumbnail.ipynb) for an article. This notebook takes things one step further – it grabs the page on which an article was published, but then it crops the page image to the boundaries of the article. The result is an image which presents the article as it was originally published. # # This is possible because information about the position of each line of text in an article is embedded in the display of the OCRd text. This notebook gathers all that positional information and uses it to draw a box around the article. The OCRd text display also includes information about any additional parts of the article that are published on other pages. This means we can grab images of the article from every page on which it appears. So an article published across three pages will generate three images. # # Here's an example. This is a [large, illustrated article](https://trove.nla.gov.au/newspaper/article/162833980) that is spread across two pages. If you download the JPG or PDF versions from Trove, you'll see they're a bit of a mess. # # # # Here are the two images of this article extracted by this notebook. # #
# # Much better! # # Note that the images are saved with a file name something like this: `nla.news-article162833980-16636766.jpg`. The first part of the file name, `nla.news-article162833980`, is the Trove identifier for this article. Just add it to `nla.gov.au/` and you'll have a link: # # https://nla.gov.au/nla.news-article162833980 # # The second number is the identifier for the page. So if you mislay the details of an article, you can always just look them up using the information in the file name. # ## How to use this notebook # # ### Running live on Binder # # * If you're viewing this notebook using [nbviewer.jupyter.org](http://nbviewer.jupyter.org) (look for 'nbviewer' in the logo or url), you'll first need to open up a *live* version on Binder. # * To open the notebook on Binder, just click on the set of rings in the top menu bar in NBViewer. They'll say 'Execute on Binder' when you hover over the icon. Be patient for a little while while Binder loads your live computing environment. # # # # * If your notebook's already running in Binder (look for 'binder.org' in the url), move on to the next step. # # ### Setting your options # # * Once the notebook has loaded in Binder, you're ready to go! # * In Trove, copy the url of the article you want to save as an image, and then come back here and paste it into the cell below where indicated. You can use the url in your browser's location bar or an article permalink. # * You can also set a maximum size for the images. # # # # ### Get your images! # # * From the 'Cell' menu select 'Run all'. Alternatively, you can hit 'Shift+Enter' to run each cell individually until you get to the bottom of the notebook. This will run the code below which goes off and prepares your images. # * The links and images will be displayed at [the bottom of the notebook](#Get-the-images!). Just click on the links to open the images, and select 'Save page as' to download them to your computer. 
# * If you want to get another article, just replace the url and 'Run all' again.

# ## Load all the things we need

# In[ ]:

import re
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from PIL import Image


# In[ ]:


def get_box(zones):
    """
    Loop through all the zones to find the outer limits of each boundary.
    Return a bounding box around the article.

    Each zone is an element carrying string-valued data-x, data-y, data-w,
    data-h attributes plus a data-page-id. Returns a dict with the page id
    and the left/top/right/bottom coordinates enclosing every zone.
    """
    page_id = zones[0]["data-page-id"]
    # min/max over all zones gives the article's outer edges.
    # (The previous version seeded left/top with the magic value 10000,
    # which would silently produce wrong boxes for coordinates > 10000.)
    left = min(int(zone["data-x"]) for zone in zones)
    top = min(int(zone["data-y"]) for zone in zones)
    right = max(int(zone["data-x"]) + int(zone["data-w"]) for zone in zones)
    bottom = max(int(zone["data-y"]) + int(zone["data-h"]) for zone in zones)
    return {
        "page_id": page_id,
        "left": left,
        "top": top,
        "right": right,
        "bottom": bottom,
    }


def get_article_boxes(article_url):
    """
    Positional information about the article is attached to each line of the
    OCR output in data attributes. This function loads the HTML version of
    the article and scrapes the x, y, and width values for each line of text
    to determine the coordinates of a box around the article.

    Returns a list of box dicts (one per page the article appears on).
    """
    boxes = []
    response = requests.get(article_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    zones = soup.select("div.zone.onPage")
    boxes.append(get_box(zones))
    # 'offPage' zones are lines of this article printed on other pages;
    # group them by page id and emit one box per page.
    off_page_zones = soup.select("div.zone.offPage")
    if off_page_zones:
        current_page = off_page_zones[0]["data-page-id"]
        zones = []
        for zone in off_page_zones:
            if zone["data-page-id"] == current_page:
                zones.append(zone)
            else:
                boxes.append(get_box(zones))
                zones = [zone]
                current_page = zone["data-page-id"]
        # Don't forget the final page's accumulated zones.
        boxes.append(get_box(zones))
    return boxes


def get_page_images(article_id, size):
    """
    Extract an image of the article from the page image(s), save it,
    and return the filename(s).

    Parameters:
        article_id -- numeric Trove newspaper article identifier (str or int)
        size -- maximum width/height in pixels, or None for full size

    Returns a list of the saved JPEG filenames, one per page.
    """
    images = []
    # Get position of article on the page(s)
    boxes = get_article_boxes("http://nla.gov.au/nla.news-article{}".format(article_id))
    for box in boxes:
        # Construct the url we need to download the page image
        # (level 7 is a high-resolution zoom level of the page image service)
        page_url = (
            "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
                box["page_id"], 7
            )
        )
        # Download the page image
        response = requests.get(page_url, timeout=60)
        response.raise_for_status()
        # Open download as an image for editing
        img = Image.open(BytesIO(response.content))
        # Crop the page image down to the article's bounding box
        points = (box["left"], box["top"], box["right"], box["bottom"])
        cropped = img.crop(points)
        # Resize if necessary
        if size:
            # Image.ANTIALIAS was removed in Pillow 10;
            # Image.LANCZOS is the long-standing equivalent filter.
            cropped.thumbnail((size, size), Image.LANCZOS)
        # Save the cropped article image, named after the article and page ids
        cropped_file = "nla.news-article{}-{}.jpg".format(article_id, box["page_id"])
        cropped.save(cropped_file)
        images.append(cropped_file)
    return images


def get_article(article_url, size):
    """
    Save image(s) of the article at `article_url`, then display a download
    link and an inline preview for each saved image.
    """
    # Extract the numeric article id from the url. The optional slash means
    # this matches both /newspaper/article/12345 urls and
    # nla.news-article12345 permalinks.
    match = re.search(r"article\/{0,1}(\d+)", article_url)
    if match is None:
        raise ValueError("Could not find an article id in: {}".format(article_url))
    article_id = match.group(1)
    images = get_page_images(article_id, size)
    for image in images:
        # Provide a clickable download link and embed the image itself
        # (the bare text/empty string previously displayed here meant
        # no link or image ever appeared in the notebook).
        display(HTML(f'<a href="{image}" download>Download {image}</a>'))
        display(HTML('<img src="{}">'.format(image)))


# ## Set your options

# In[ ]:

# Copy the url of the article you want and paste it between the quotes
article_url = "https://trove.nla.gov.au/newspaper/article/107024751?searchTerm=wragge"

# Set this if you want to limit the size of the image.
# Leave as None if you want them at full size
max_size = None

# ## Get the images!
#
# The links and images will load below once they're ready.

# In[ ]:

# Guard so importing this module doesn't trigger the download.
# (When run as a script or notebook, __name__ is "__main__", so
# behaviour there is unchanged.)
if __name__ == "__main__":
    get_article(article_url, max_size)

# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the
# [GLAM Workbench](https://glam-workbench.github.io/).
# Support this project by becoming a
# [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).
# # Work on this notebook was supported by the [Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab](https://tinker.edu.au/).