#!/usr/bin/env python
# coding: utf-8

# # Get covers (or any other pages) from a digitised journal in Trove
#
# In [another notebook](Get-text-from-a-Trove-journal.ipynb), I showed how to get issue metadata and OCRd texts from a digitised journal in Trove. It's also possible to download page images and PDFs. This notebook shows how to download all the cover images from a specified journal. With some minor modifications you could download any page, or range of pages.

# ## Import what we need

# In[1]:


# Let's import the libraries we need.
import io
import os
import re
import shutil
import time
import zipfile

import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))


# ## What journal do you want?
#
# In the cell below, replace the `nla.obj-...` value with the identifier of the journal you want to harvest. You'll find the identifier in the url of the journal's landing page. An easy way to find it is to go to the [Trove Titles app](https://trove-titles.herokuapp.com/) and click on the 'Browse issues' button for the journal you're interested in.
#
# For example, if I click on the 'Browse issues' button for the *Angry Penguins broadsheet* it opens `http://nla.gov.au/nla.obj-320790312`, so the journal identifier is `nla.obj-320790312`.

# In[2]:


# Replace the value in the quotes with the identifier of your chosen journal
journal_id = "nla.obj-320790312"

# Where do you want to save the results?
output_dir = "images"

# Set up the data directory
image_dir = os.path.join(output_dir, journal_id)
os.makedirs(image_dir, exist_ok=True)


# ## Define some functions to do the work

# In[3]:


def harvest_metadata(obj_id):
    """
    This calls an internal API from a journal landing page to extract a list of available issues.
    """
    start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
    # The initial startIdx value
    start = 0
    # Number of results per page
    n = 20
    issues = []
    with tqdm(desc="Issues", leave=False) as pbar:
        # If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.
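        # Each pass through the loop requests the next batch of (up to) 20 issues,
        # starting from the current value of startIdx.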
        while n == 20:
            # Get the browse page
            response = s.get(start_url.format(obj_id, start), timeout=60)
            # BeautifulSoup turns the HTML into an easily navigable structure
            soup = BeautifulSoup(response.text, "lxml")
            # Find all the divs containing issue details and loop through them
            details = soup.find_all(class_="l-item-info")
            for detail in details:
                issue = {}
                title = detail.find("h3")
                if title:
                    issue["title"] = title.text
                    issue["id"] = title.parent["href"].strip("/")
                else:
                    issue["title"] = "No title"
                    issue["id"] = detail.find("a")["href"].strip("/")
                try:
                    # Get the issue details
                    issue["details"] = detail.find(
                        class_="obj-reference content"
                    ).string.strip()
                except (AttributeError, IndexError):
                    issue["details"] = "issue"
                # Get the number of pages
                try:
                    issue["pages"] = int(
                        re.search(
                            r"^(\d+)",
                            detail.find("a", attrs={"data-pid": issue["id"]}).text,
                            flags=re.MULTILINE,
                        ).group(1)
                    )
                except AttributeError:
                    issue["pages"] = 0
                issues.append(issue)
                # print(issue)
            if not response.from_cache:
                time.sleep(0.5)
            # Increment the startIdx
            start += n
            # Set n to the number of results on the current page
            n = len(details)
            pbar.update(n)
    return issues


def save_page(issues, output_dir, page_num=1):
    """
    Downloads the specified page from a list of journal issues.
    If you want to download a range of pages you can set the `lastPage` parameter to your end point.
    But beware the images are pretty large.
    """
    # Loop through the issue metadata
    for issue in tqdm(issues):
        # print(issue['id'])
        id = issue["id"]
        # Check to see if the page of this issue has already been downloaded
        if not os.path.exists(
            os.path.join(output_dir, "{}-{}.jpg".format(id, page_num))
        ):
            # Note that firstPage and lastPage in the url are zero-based.
            # Change lastPage to download a range of pages.
            url = "https://nla.gov.au/{0}/download?downloadOption=zip&firstPage={1}&lastPage={1}".format(
                id, page_num - 1
            )
            # Get the file
            r = s.get(url, timeout=60)
            # print(r.url, r.status_code)
            # The image is in a zip, so we need to extract the contents into the output directory
            z = zipfile.ZipFile(io.BytesIO(r.content))
            z.extractall(output_dir)
            time.sleep(0.5)


# ## Get a list of issues
#
# Run the cell below to extract a list of issues for your selected journal and save them to the `issues` variable.

# In[4]:


issues = harvest_metadata(journal_id)


# Convert the list of issues to a Pandas dataframe and have a look inside.

# In[5]:


df = pd.DataFrame(issues)
df.head()


# Save the data to a CSV file.

# In[6]:


df.to_csv("{}/issues.csv".format(image_dir), index=False)


# ## Get the images
#
# Run the cell below to work through the list of issues, downloading the first page of each, and saving it to the specified directory. Note that the images can be quite large!

# In[ ]:


save_page(issues, image_dir, 1)


# ## Download the results
#
# If you're running this notebook using a cloud service (like Binder), you'll want to download your results. The cell below zips up the journal directory and creates a link for easy download.

# In[8]:


shutil.make_archive(image_dir, "zip", image_dir)
display(HTML("Download results"))
display(
    HTML(f'<a href="{image_dir}.zip" download>{image_dir}.zip</a>')
)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).
#
# Work on this notebook was supported by the [Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab](https://tinker.edu.au/).
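# ----
#
# ## Extra: downloading a range of pages (sketch)
#
# The introduction notes that with some minor modifications you could download a range of pages, and the comments in `save_page` point to the `firstPage` and `lastPage` values in the download url. The hypothetical helper below (`save_page_range` is not part of the original notebook) is a minimal sketch of one way to do that, assuming the zip download endpoint accepts a larger `lastPage` value as those comments suggest. It skips the 'already downloaded' check, and remember that the images are large.

# In[ ]:


def save_page_range(issues, output_dir, first_page=1, last_page=2):
    """
    Sketch only: download pages first_page to last_page (1-based, inclusive)
    of each issue as a single zip and extract the images into output_dir.
    """
    for issue in tqdm(issues):
        issue_id = issue["id"]
        # As in save_page, firstPage and lastPage in the download url are zero-based
        url = "https://nla.gov.au/{0}/download?downloadOption=zip&firstPage={1}&lastPage={2}".format(
            issue_id, first_page - 1, last_page - 1
        )
        # Get the zip of page images
        r = s.get(url, timeout=60)
        # Extract the contents into the output directory
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(output_dir)
        time.sleep(0.5)


# For example, to save the first two pages of every issue:
# save_page_range(issues, image_dir, 1, 2)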