#!/usr/bin/env python
# coding: utf-8

# # Download a page image
#
# The Trove web interface doesn't provide a way of getting high-resolution page images from newspapers. This simple app lets you download page images as complete, high-resolution JPG files.

# In[ ]:


import base64
import os
import re
from collections import OrderedDict
from io import BytesIO
from operator import itemgetter

import arrow
import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display

# In[ ]:


get_ipython().run_cell_magic('capture', '', '# Load env variables\n%load_ext dotenv\n%dotenv\n')

# In[ ]:


# Most recently loaded newspaper titles (label -> id); filled by get_titles().
titles = {}
# All messages, links and images produced by the callbacks are shown here.
out = widgets.Output()


def get_page_id(article_url):
    """Return the id of the newspaper page that an article appears on.

    The article's web page is downloaded and the ``data-page-id`` attribute
    of its first on-page OCR zone is read.

    Args:
        article_url: Url of a Trove newspaper article. (Inside this function
            the name refers to this string, not the module-level widget of
            the same name.)

    Returns:
        The page id as a string, or None if the page contains no OCR zones
        or the first zone has no ``data-page-id`` attribute.
    """
    response = requests.get(article_url)
    soup = BeautifulSoup(response.text, "lxml")
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    zones = soup.select("div.zone.onPage")
    if not zones:
        return None
    return zones[0].get("data-page-id")


def _show_message(message):
    """Display a short HTML message in the shared output area."""
    with out:
        display(HTML(message))


def _find_page_id_in_url(url):
    """Get a page id from a user-supplied page or article url."""
    # Page urls carry the id themselves ('.../page/1234' or 'nla.news-page1234').
    match = re.search(r"page\/{0,1}(\d+)", url)
    if match:
        return match.group(1)
    # Otherwise treat it as an article url and look the page up from its html.
    return get_page_id(url)


def _find_page_id_by_search():
    """Get a page id from the Trove API using the date, title and page widgets."""
    if date.value is None:
        # Nothing to search for until a date has been picked.
        return None
    end = arrow.get(date.value)
    start = end.shift(days=-1)
    date_query = "date:[{}Z TO {}Z]".format(
        start.format("YYYY-MM-DDT00:00:00"), end.format("YYYY-MM-DDT00:00:00")
    )
    params = {
        "zone": "newspaper",
        "reclevel": "full",
        "encoding": "json",
        "n": "1",
        "q": "{} firstpageseq:{}".format(date_query, page.value),
        "l-title": title.value,
        "key": api_key.value,
    }
    response = requests.get("http://api.trove.nla.gov.au/v2/result", params=params)
    try:
        data = response.json()
        article = data["response"]["zone"][0]["records"]["article"][0]
        page_link = article["trovePageUrl"]
    except (KeyError, IndexError, ValueError):
        # ValueError covers a body that is not JSON (e.g. an API error page).
        return None
    match = re.search(r"page\/(\d+)", page_link)
    return match.group(1) if match else None


def get_page_image(b):
    """Download a newspaper page image and show it with a download link.

    The page is identified from the url in the ``article_url`` widget if one
    has been entered; otherwise the date, newspaper title and page number
    widgets are used to search the Trove API. The image is then requested at
    the resolution held by ``size`` and displayed in ``out`` beneath a link
    that saves it as ``<page id>-level<size>.jpg``.

    NOTE(review): ``size`` is not defined in this part of the file; presumably
    it is the resolution widget created in a later cell -- confirm.

    Args:
        b: Unused. Presumably the button that triggered the callback.
    """
    out.clear_output()
    url = article_url.value
    page_id = _find_page_id_in_url(url) if url else _find_page_id_by_search()
    if not page_id:
        _show_message("Page not found!")
        return

    # Construct the url we need to download the image
    page_url = (
        "http://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
            page_id, size.value
        )
    )
    # Download the page image
    response = requests.get(page_url)
    if not response.ok:
        # Don't try to render an error page as if it were a JPEG.
        _show_message("Page not found!")
        return
    image_bytes = response.content

    # For the download link we can use a data uri -- a base64 encoded version of the file
    encoded_image = base64.b64encode(image_bytes).decode()
    encoded_string = "data:image/jpeg;base64," + encoded_image
    filename = f"{page_id}-level{size.value}.jpg"
    with out:
        display(
            HTML(
                f'<a download="{filename}" href="{encoded_string}">Download {filename}</a>'
            )
        )
        display(widgets.Image(value=image_bytes, format="jpg"))


def get_titles(b):
    """Load the newspaper titles from the Trove API into the title dropdown.

    Titles are sorted alphabetically and mapped to their Trove ids, so the
    dropdown shows the title while its value is the id. The mapping is also
    kept in the module-level ``titles``.

    Args:
        b: Unused. The button whose click triggered the callback.
    """
    global titles
    params = {"encoding": "json", "key": api_key.value}
    response = requests.get(
        "http://api.trove.nla.gov.au/v2/newspaper/titles", params=params
    )
    try:
        data = response.json()
        title_list = [
            (t["title"], t["id"]) for t in data["response"]["records"]["newspaper"]
        ]
    except (KeyError, TypeError, ValueError):
        # Most likely a missing or invalid API key; leave the dropdown untouched.
        out.clear_output()
        _show_message("Couldn't load titles -- check your API key.")
        return
    title_list.sort(key=itemgetter(0))
    titles = OrderedDict(title_list)
    title.options = titles


# ## Either enter an article or page url...
#
# You can use the url in your browser's location bar or an article or page permalink.

# In[ ]:


article_url = widgets.Text(
    placeholder="Enter an article or page url",
    description="Article/Page:",
    disabled=False,
)
display(article_url)


# ## Or provide a date, newspaper title, and page number
#
# If you've provided a url above these settings will be ignored.
#
# Get your own [Trove API key](http://help.nla.gov.au/trove/building-with-trove/api) and enter it below.
# In[ ]:


# Trove API key, sent with every API request made by the callbacks.
api_key = widgets.Text(
    placeholder="Enter your Trove API key", description="API key:", disabled=False
)
display(api_key)


# In[ ]:


# Publication date of the issue to search for.
date = widgets.DatePicker(description="Date:", disabled=False)
display(date)


# In[ ]:


# Page number within the issue.
page = widgets.IntText(value=1, description="Page:", disabled=False)
display(page)


# In[ ]:


# The dropdown starts with a placeholder entry; the button replaces its
# options with the titles returned by get_titles.
title = widgets.Dropdown(
    options=["Click the button to load titles"],
    description="Newspaper:",
    disabled=False,
)

titles_button = widgets.Button(
    description="Load titles",
    disabled=False,
    button_style="",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Click to load titles",
    icon="",
)

titles_button.on_click(get_titles)

display(widgets.HBox([title, titles_button]))


# ## Choose the resolution
# Page images are available in seven resolutions that correspond to the zoom levels in the Trove web interface. As a rough guide:
#
#
#