#!/usr/bin/env python
# coding: utf-8

# # Get OCRd text from a digitised journal in Trove
#
# Many of the digitised periodicals in Trove make OCRd text available for download. This notebook helps you download all the OCRd text from a single periodical – one text file for each issue.
#
# There are two main steps:
#
# - get a list of the issues available from a digitised periodical
# - download the text of each issue
#
# Version 3 of the Trove API introduced the `/magazine/title` endpoint to retrieve information about a digitised periodical. Using this, it's possible to get a list of issues for any digitised periodical. However, the endpoint is currently buggy and I've found a number of cases where the list of issues is incomplete. To avoid extra checking and workarounds, I'm not currently using the API to generate the list of issues. Instead, I'm using my earlier method that calls an internal API to deliver the HTML content of the 'Browse' panel. The browse panel includes links to all the issues of the journal. The API that populates it takes a `startIdx` parameter and returns a maximum of 20 issues at a time. Using this, you can work your way through the complete list of issues, scraping the basic metadata from the HTML, including the identifier, title, and number of pages.
#
# While you can download the complete text of an issue using the web interface, there's no option to do this with the API alone. The workaround is to mimic the web interface by constructing a download link using the issue identifier and number of pages.

# ## Import what we need

# In[ ]:


# Let's import the libraries we need.
import glob
import os
import re
import shutil
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from tqdm.auto import tqdm

# Use a requests session that automatically retries requests after server errors
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))


# ## What journal do you want?
#
# In the cell below, replace the `nla.obj-...` value with the identifier of the journal you want to harvest. You'll find the identifier in the URL of the journal's landing page. An easy way to find it is to go to the [Trove Titles app](https://trove-titles.herokuapp.com/) and click on the 'Browse issues' button for the journal you're interested in.
#
# For example, if I click on the 'Browse issues' button for the *Angry Penguins broadsheet* it opens `http://nla.gov.au/nla.obj-320790312`, so the journal identifier is `nla.obj-320790312`.

# In[ ]:


# Replace the value in the quotes with the identifier of your chosen journal
journal_id = "nla.obj-320790312"


# ## Define some functions to do the work

# In[ ]:


def harvest_metadata(obj_id):
    """
    This calls an internal API from a journal landing page to extract a list of available issues.
    """
    start_url = "https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c"
    # The initial startIdx value
    start = 0
    # Number of results per page
    n = 20
    issues = []
    with tqdm(desc="Issues", leave=False) as pbar:
        # If there aren't 20 results on the page then we've reached the end,
        # so continue harvesting until that happens.
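        # Each issue is collected as a dict with 'title', 'id' (the nla.obj identifier),
        # 'details', and 'pages' keys; save_ocr() uses these later to name the text files
        # and build the download links.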
        while n == 20:
            # Get the browse page
            response = s.get(start_url.format(obj_id, start), timeout=60)
            # Beautifulsoup turns the HTML into an easily navigable structure
            soup = BeautifulSoup(response.text, "lxml")
            # Find all the divs containing issue details and loop through them
            details = soup.find_all(class_="l-item-info")
            for detail in details:
                issue = {}
                title = detail.find("h3")
                if title:
                    issue["title"] = title.text
                    issue["id"] = title.parent["href"].strip("/")
                else:
                    issue["title"] = "No title"
                    issue["id"] = detail.find("a")["href"].strip("/")
                try:
                    # Get the issue details
                    issue["details"] = detail.find(
                        class_="obj-reference content"
                    ).string.strip()
                except (AttributeError, IndexError):
                    issue["details"] = "issue"
                # Get the number of pages
                try:
                    issue["pages"] = int(
                        re.search(
                            r"^(\d+)",
                            detail.find("a", attrs={"data-pid": issue["id"]}).text,
                            flags=re.MULTILINE,
                        ).group(1)
                    )
                except AttributeError:
                    issue["pages"] = 0
                issues.append(issue)
                # print(issue)
            time.sleep(0.2)
            # Increment the startIdx
            start += n
            # Set n to the number of results on the current page
            n = len(details)
            pbar.update(n)
    return issues


def save_ocr(issues, obj_id, title=None, output_dir="journals"):
    """
    Download the OCRd text for each issue.
    """
    processed_issues = []
    if not title:
        title = issues[0]["title"]
    output_path = os.path.join(output_dir, "{}-{}".format(slugify(title)[:50], obj_id))
    texts_path = os.path.join(output_path, "texts")
    os.makedirs(texts_path, exist_ok=True)
    for issue in tqdm(issues, desc="Texts", leave=False):
        # Default values
        issue["text_file"] = ""
        if issue["pages"] != 0:
            # print(issue['title'])
            # The index value for the last page of an issue will be the total pages - 1
            last_page = issue["pages"] - 1
            file_name = "{}-{}-{}.txt".format(
                slugify(issue["title"])[:50],
                slugify(issue["details"])[:50],
                issue["id"],
            )
            file_path = os.path.join(texts_path, file_name)
            # Check to see if the file has already been harvested
            if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
                # print('Already saved')
                issue["text_file"] = file_name
            else:
                url = "https://trove.nla.gov.au/{}/download?downloadOption=ocr&firstPage=0&lastPage={}".format(
                    issue["id"], last_page
                )
                # print(url)
                # Get the file
                r = s.get(url, timeout=120)
                # Check there was no error
                if r.status_code == requests.codes.ok:
                    # Check that the file's not empty
                    r.encoding = "utf-8"
                    if len(r.text) > 0 and not r.text.isspace():
                        # Check that the file isn't HTML (some not found pages don't return 404s)
                        if BeautifulSoup(r.text, "html.parser").find("html") is None:
                            # If everything's ok, save the file
                            with open(file_path, "w", encoding="utf-8") as text_file:
                                text_file.write(r.text)
                            issue["text_file"] = file_name
                time.sleep(1)
        processed_issues.append(issue)
    df = pd.DataFrame(processed_issues)
    # Remove empty directories
    try:
        os.rmdir(texts_path)
        os.rmdir(output_path)
    except OSError:
        # The directory isn't empty, so save the list of issues as a CSV file
        df.to_csv(
            os.path.join(output_path, "{}-issues.csv".format(obj_id)), index=False
        )


# ## Get a list of issues
#
# Run the cell below to extract a list of issues for your selected journal and save them to the `issues` variable.

# In[ ]:


issues = harvest_metadata(journal_id)


# ## Download the OCRd texts
#
# Now that we have a list of issues, we can download the texts!
#
# The OCRd text for each issue will be saved in an individual text file. By default, results will be saved under the `journals` directory, though you can change this by giving the `save_ocr()` function a different value for `output_dir`, as shown in the example below.
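#
# For example, to save everything under a different directory you could call `save_ocr()` like this (a minimal sketch – the `my_journal_data` directory name is just an illustration):
#
# ```python
# save_ocr(issues, journal_id, output_dir="my_journal_data")
# ```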
#
# The name of the journal directory is created using the journal title and journal id. Inside this directory is a CSV-formatted file containing details of all the available issues, and a `texts` sub-directory to contain the downloaded text files.
#
# The individual file names are created using the journal title, issue details, and issue identifier. So the resulting hierarchy might look something like this:
#
# ```
# journals
#   - angry-penguins-nla.obj-320790312
#     - nla.obj-320790312-issues.csv
#     - texts
#       - angry-penguins-broadsheet-no-1-nla.obj-320791009.txt
# ```
#
# The CSV list of issues includes the following fields:
#
# * `details` – string with issue details, might include dates, issue numbers etc.
# * `id` – issue identifier
# * `pages` – number of pages in this issue
# * `text_file` – file name of any downloaded OCRd text
# * `title` – journal title (as extracted from the issue browse list, might differ from the original journal title)
#
# Note that if the `text_file` field is empty, it means that no OCRd text could be extracted for that particular issue. Note also that if no OCRd text is available for any of the issues, no journal directory will be created and nothing will be saved.
#
# Run the cell below to download the OCRd text.

# In[ ]:


save_ocr(issues, journal_id)


# ## View and download the results
#
# If you've used the default output directory, you'll find the data in the `journals` directory.
#
# If you're running this notebook using a cloud service (like Binder), you'll want to download your results. The cell below zips up the journal directory and creates a link for easy download.

# In[ ]:


journal_dir = glob.glob(os.path.join("journals", "*-{}".format(journal_id)))[0]
shutil.make_archive(journal_dir, "zip", journal_dir)
display(HTML("Download results"))
display(HTML(f'<a href="{journal_dir}.zip" download>{journal_dir}.zip</a>'))


# Let's have a peek at the issues data...

# In[ ]:


df = pd.read_csv(
    os.path.join(journal_dir, "{}-issues.csv".format(journal_id)),
    keep_default_na=False,
)
df.head()


# How many issues are available, and how many have OCRd text?

# In[ ]:


num_issues = df.shape[0]
num_text = df.loc[df["text_file"] != ""].shape[0]
print("{} / {} issues have OCRd text".format(num_text, num_issues))


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).
#
# Work on this notebook was supported by the [Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab](https://tinker.edu.au/).