#!/usr/bin/env python # coding: utf-8 # # Exploring digitised maps in Trove # # If you've ever poked around in Trove's 'map' zone, you might have noticed the beautiful deep-zoomable images available for many of the NLA's digitised maps. Even better, in many cases the high-resolution TIFF versions of the digitised maps are available for download. # # I knew there were lots of great maps you could download from Trove, but how many? And how big were the files? I thought I'd try to quantify this a bit by harvesting and analysing the metadata. # # The size of the downloadable files (both in bytes and pixels) are [embedded within the landing pages](https://nbviewer.jupyter.org/github/GLAM-Workbench/trove-books/blob/master/Metadata-for-Trove-digitised-works.ipynb) for the digitised maps. So harvesting the metadata involves a number of steps: # # * Use the Trove API to search for maps that include the phrase "nla.obj" – this will filter the results to maps that have been digitised and are available through Trove # * Work through the results, checking to see if the record includes a link to a digital copy. # * If there is a digital copy, extract the embedded work data from the landing page. # * Scrape the copyright status from the page. # # **2023 update!** It turns out that embedded within the embedded data are MARC descriptions that include some other metadata that's not available through the API. This includes the map scale and coordinates. The coordinates can either be a point, or a bounding box. I've saved these values as well, and explored some ways of parsing and visualising the coordinates in this notebook. # # The fields in the harvested dataset are: # # * `title` – title of the map # * `url` – url to the map in the digitised file viewer # * `work_url` – url to the work in the Trove map category # * `identifier` – NLA identifier # * `date` – date published or created # * `creators` – creators of the map # * `publication` – publication place, publisher, and publication date (if available) # * `extent` – physical description of map # * `copyright_status` – copyright status based on available metadata (scraped from web page) # * `scale` – map scale # * `coordinates` – map coordinates, either a point or a bounding box (format is 'W--E/N--S', eg: 'E 130⁰50'--E 131⁰00'/S 12⁰30'--S 12⁰40') # * `filesize_string` – filesize string in MB # * `filesize` – size of TIFF file in bytes # * `width` – width of TIFF in pixels # * `height` – height of TIFF in pixels # * `copy_role` – I'm not sure what the values in this field signify, but as described below, you can use them to download high-res TIFF images # # ## Getting map images # # There are a couple of undocumented tricks that make it easy to programatically download images of the maps. # # * To view the JPG version, just add `/image` to the map url. For example: http://nla.gov.au/nla.obj-232162256/image # * The JPG image will be at the highest available resolution, but you requests smaller versions using the `wid` parameter to specify a pixel width. For example: http://nla.gov.au/nla.obj-232162256/image?wid=400 # * There seems to be an upper limit for the resolution of the JPG versions, higher resolutions might be available via the TIFF file which you can download by adding the `copy_role` value to the url. For example, if the `copy_role` is 'm' this url will download the TIFF: http://nla.gov.au/nla.obj-232162256/m (note that some of these files are very, very large – you might want to check the `filesize` before downloading) # # ## Setting things up # In[1]: import datetime import json import os import re import time import warnings warnings.simplefilter(action="ignore", category=FutureWarning) import altair as alt import pandas as pd import requests_cache from bs4 import BeautifulSoup from IPython.display import FileLink, display from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry from tqdm.auto import tqdm s = requests_cache.CachedSession() retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504]) s.mount("https://", HTTPAdapter(max_retries=retries)) s.mount("http://", HTTPAdapter(max_retries=retries)) # In[2]: get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n') # ## You'll need a Trove API key to harvest the data. # In[3]: # This creates a variable called 'api_key', paste your key between the quotes api_key = "" # Use an api key value from environment variables if it is available (useful for testing) if os.getenv("TROVE_API_KEY"): api_key = os.getenv("TROVE_API_KEY") # This displays a message with your key print("Your API key is: {}".format(api_key)) # ## Define some functions to do the work # In[4]: def get_total_results(params): """ Get the total number of results for a search. """ these_params = params.copy() these_params["n"] = 0 response = s.get("https://api.trove.nla.gov.au/v2/result", params=these_params) data = response.json() return int(data["response"]["zone"][0]["records"]["total"]) def get_fulltext_url(links): """ Loop through the identifiers to find a link to the digital version of the journal. """ url = None for link in links: if link["linktype"] == "fulltext" and "nla.obj" in link["value"]: url = link["value"] break return url def get_copyright_status(response=None, url=None): """ Scrape copyright information from a digital work page. """ if url and not response: response = s.get(url) if response: soup = BeautifulSoup(response.text, "lxml") try: copyright_status = str( soup.find("div", id="tab-access").find("p", class_="decorative").string ) return copyright_status # No access tab except AttributeError: pass return None def get_work_data(url): """ Extract work data in a JSON string from the work's HTML page. """ response = s.get(url) try: work_data = json.loads( re.search( r"var work = JSON\.parse\(JSON\.stringify\((\{.*\})", response.text ).group(1) ) except (AttributeError, TypeError): work_data = {} # else: # If there's no copyright info in the work data, then scrape it # if "copyrightPolicy" not in work_data: # work_data["copyrightPolicy"] = get_copyright_status(response) if not response.from_cache: time.sleep(0.2) return work_data def find_field_content(record, tag, subfield): """ Loop through a MARC record looking for tag/subfield. If found, return the subfield value. """ try: for field in record["datafield"]: if field["tag"] == tag: if isinstance(field["subfield"], list): for sfield in field["subfield"]: if sfield["code"] == subfield: return sfield["content"] else: if field["subfield"]["code"] == subfield: return field["subfield"]["content"] except (KeyError, TypeError): pass return None def get_marc_field(work_data, tag, subfield): """ Loop through all the MARC records in work metadata looking for a tag/subfield. If found, return the subfield value. """ if "marcData" in work_data and work_data["marcData"]: for record in work_data["marcData"]["record"]: content = find_field_content(record, tag, subfield) if content: return content return None def format_bytes(size): """ Format bytes as a human-readable string """ # 2**10 = 1024 power = 2**10 n = 0 power_labels = {0: "", 1: "K", 2: "M", 3: "G", 4: "T"} while size > power: size /= power n += 1 return size, power_labels[n] + "B" def get_publication_details(work_data): """ Get MARC values for publication details and combine into a single string. """ parts = [] for code in ["a", "b", "c"]: value = get_marc_field(work_data, 260, code) if value: parts.append(str(value)) return " ".join(parts) def get_map_data(work_data): """ Look for file size information in the embedded data """ map_data = {} width = None height = None num_bytes = None try: # Make sure there's a downloadable version if ( work_data.get("accessConditions") == "Unrestricted" and "copies" in work_data ): for copy in work_data["copies"]: # Get the pixel dimensions if "technicalmetadata" in copy: width = copy["technicalmetadata"].get("width") height = copy["technicalmetadata"].get("height") # Get filesize in bytes elif ( copy["copyrole"] in ["m", "o", "i", "fd"] and copy["access"] == "true" ): num_bytes = copy.get("filesize") copy_role = copy["copyrole"] if width and height and num_bytes: size, unit = format_bytes(num_bytes) # Convert bytes to something human friendly map_data["filesize_string"] = "{:.2f}{}".format(size, unit) map_data["filesize"] = num_bytes map_data["width"] = width map_data["height"] = height map_data["copy_role"] = copy_role except AttributeError: pass return map_data def get_maps(): """ Harvest metadata about maps. """ url = "http://api.trove.nla.gov.au/v2/result" maps = [] params = { "q": '"nla.obj-"', "zone": "map", "l-availability": "y", "l-format": "Map/Single map", "bulkHarvest": "true", # Needed to maintain a consistent order across requests "key": api_key, "n": 100, "encoding": "json", } start = "*" total = get_total_results(params) with tqdm(total=total) as pbar: while start: params["s"] = start response = s.get(url, params=params) data = response.json() # If there's a startNext value then we get it to request the next page of results try: start = data["response"]["zone"][0]["records"]["nextStart"] except KeyError: start = None for work in tqdm( data["response"]["zone"][0]["records"]["work"], leave=False ): # Check to see if there's a link to a digital version try: fulltext_url = get_fulltext_url(work["identifier"]) except KeyError: pass else: if fulltext_url: work_data = get_work_data(fulltext_url) map_data = get_map_data(work_data) obj_id = re.search(r"(nla\.obj\-\d+)", fulltext_url).group(1) try: contributors = "|".join(work.get("contributor")) except TypeError: contributors = work.get("contributor") # Get basic metadata # You could add more work data here # Check the Trove API docs for work record structure map_data["title"] = work["title"] map_data["url"] = fulltext_url map_data["work_url"] = work.get("troveUrl") map_data["identifier"] = obj_id map_data["date"] = work.get("issued") map_data["creators"] = contributors map_data["publication"] = get_publication_details(work_data) map_data["extent"] = work_data.get("extent") # I think the copyright status scraped from the page (below) is more likely to be accurate # map_data["copyright_policy"] = work_data.get("copyrightPolicy") map_data["copyright_status"] = get_copyright_status( url=fulltext_url ) map_data["scale"] = get_marc_field(work_data, 255, "a") map_data["coordinates"] = get_marc_field(work_data, 255, "c") maps.append(map_data) # print(map_data) if not response.from_cache: time.sleep(0.2) pbar.update(100) return maps # ## Download map data # In[ ]: maps = get_maps() # ## Convert to dataframe and save to CSV # In[ ]: # Convert to dataframe # Convert dtypes converts numbers to integers rather than floats df = pd.DataFrame(maps).convert_dtypes() # Reorder columns df = df[ [ "identifier", "title", "url", "work_url", "date", "creators", "publication", "extent", "copyright_status", "scale", "coordinates", "filesize_string", "filesize", "width", "height", "copy_role", ] ] df.head() # In[ ]: # Save to CSV csv_file = f"single_maps_{datetime.datetime.now().strftime('%Y%m%d')}.csv" df.to_csv(csv_file, index=False) display(FileLink(csv_file)) # ## Let's explore the results # In[5]: # Reload data from CSV if necessary df = pd.read_csv( "https://raw.githubusercontent.com/GLAM-Workbench/trove-maps-data/main/single_maps_20230131.csv" ) # How many digitised maps are available? # In[6]: print("{:,} maps".format(df.shape[0])) # How many of the maps have high-resolution downloads? # In[7]: df.loc[df["filesize"].notnull()].shape # What are the `copy_role` values? # In[8]: df["copy_role"].value_counts() # How much map data is available for download? # In[9]: size, unit = format_bytes(df["filesize"].sum()) print("{:.2f}{}".format(size, unit)) # What's the copyright status of the maps? # In[10]: df["copyright_status"].value_counts() # Let's show the copyright status as a chart... # In[11]: counts = df["copyright_status"].value_counts().to_frame().reset_index() counts.columns = ["status", "count"] alt.Chart(counts).mark_bar().encode( y="status:N", x="count", tooltip="count" ).properties(height=200) # Let's look at the sizes of the download files. To make this easier we'll divide the filesizes into ranges (bins) and count the number of files in each range. # In[12]: # Convert bytes to mb df["mb"] = df["filesize"] / 2**10 / 2**10 # Create 500mb-sized bins and count the number of files in each bin sizes = ( pd.cut(df["mb"], bins=[0, 500, 1000, 1500, 2000, 3000, 3500]) .value_counts() .to_frame() .reset_index() ) sizes.columns = ["mb", "count"] # Convert intervals to strings for display in chart sizes["mb"] = sizes["mb"].astype(str) sizes # In[13]: alt.Chart(sizes).mark_bar().encode( x=alt.X("mb:N", sort=None), y="count:Q", tooltip="count:Q" ).properties(width=400) # So while most are less than 500MB, more than 10,000 are between 0.5 and 1GB! # What's the biggest file available for download? # In[14]: df.iloc[df["filesize"].idxmax()] # All downloads greater than 3GB. # In[15]: df.loc[(df["filesize"] / 2**10 / 2**10 / 2**10) > 3] # The widest image? # In[16]: df.iloc[df["width"].idxmax()] # The tallest image? # In[17]: df.iloc[df["height"].idxmax()] # ---- # # Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). # # Work on this notebook was originally supported by the [Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab](https://tinker.edu.au/). #