#!/usr/bin/env python
# coding: utf-8

# # Create a flat list of organisations contributing metadata to Trove
#
# The Trove API includes the `contributor` endpoint for retrieving information about organisations whose metadata is aggregated into Trove. If you include the `reclevel=full` parameter, you can get details of all contributors with a single API request like this:
#
# ```
# https://api.trove.nla.gov.au/v2/contributor?encoding=json&reclevel=full&key=[YOUR API KEY]
# ```
#
# However, the data can be difficult to use because of its nested structure, with some organisations having several levels of subsidiaries. There's also some inconsistency in the way nested records are named. This notebook aims to work around these problems by converting the nested data into a single flat list of organisations.
#
# This code is used to make weekly harvests of the contributor data which are saved [in this repository](https://github.com/wragge/trove-contributor-totals).

# ## Set things up

# In[170]:


import datetime
import json
import os
from pathlib import Path

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Create a session that will automatically retry on server errors
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))


# In[171]:


get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n')


# In[172]:


# Insert your Trove API key
API_KEY = "YOUR API KEY"

# A key supplied through the environment takes precedence over the placeholder
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")


# ## Define some functions

# In[173]:


def get_contrib_details(record, parent=None):
    """
    Get the details of a contributor, recursing through children if present.

    Parameters:
        record: a contributor record (dict) from the API response. It must
            include "id", "name" and "totalholdings", and may include "nuc"
            and "children".
        parent: the parent record when `record` is a subsidiary, otherwise None.

    Returns:
        A list of flat dicts with the keys "id", "name", "total_items",
        "parent" and "nuc". The contributor itself comes first, followed by
        all of its descendants.

    The supplied `record` is not modified.
    """
    name = record["name"]
    parent_id = None
    # If this is a child record, combine parent and child names
    if parent:
        # Only prepend the parent's name when the child doesn't already start with it
        if not name.startswith(parent["name"]):
            name = f"{parent['name']} {name}"
        parent_id = parent["id"]
    # Use the first NUC if any are listed; a missing or empty list gives None
    nucs = record.get("nuc")
    details = {
        "id": record["id"],
        "name": name,
        "total_items": int(record["totalholdings"]),
        "parent": parent_id,
        "nuc": nucs[0] if nucs else None,
    }
    records = [details]
    if "children" in record:
        # Pass forward combined names for deeply nested orgs, using a shallow
        # copy so the caller's data is left untouched
        records += get_children({**record, "name": name})
    return records


def get_children(parent):
    """
    Process child records.

    Parameters:
        parent: a contributor record (dict) whose ["children"]["contributor"]
            entry is an iterable of child records.

    Returns:
        A flat list of detail dicts for every child and, through
        get_contrib_details, every deeper descendant.
    """
    children = []
    for child in parent["children"]["contributor"]:
        children += get_contrib_details(child, parent)
    return children


def get_contributors(save_json=True):
    """
    Get all Trove contributors, flattening the nested structure and
    optionally saving the original JSON.

    Parameters:
        save_json: when True, write the unmodified API response to
            `trove-contributors-YYYYMMDD.json` in the current directory.

    Returns:
        A flat list of contributor detail dicts (see get_contrib_details).

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    params = {"encoding": "json", "reclevel": "full", "key": API_KEY}
    response = s.get("https://api.trove.nla.gov.au/v2/contributor", params=params)
    # Stop on HTTP errors (for example if the API key is rejected) before
    # anything is parsed or written to the dated JSON file
    response.raise_for_status()
    data = response.json()
    # Save the original nested JSON response
    if save_json:
        Path(
            f"trove-contributors-{datetime.datetime.now().strftime('%Y%m%d')}.json"
        ).write_text(json.dumps(data))
    # Get details of each contributor
    contributors = []
    for contrib in data["response"]["contributor"]:
        contributors += get_contrib_details(contrib)
    return contributors


# ## Get the data

# In[174]:


contributors = get_contributors()


# Convert the data to a dataframe.

# In[175]:


df = pd.DataFrame(contributors)
df.head()


# How many contributors are listed?

# In[178]:


df.shape[0]


# How many of the contributor records include NUCs?

# In[179]:


df.loc[df["nuc"].notnull()].shape[0]


# Save the data to a CSV file.
# In[177]:


# Fix the column order for the export, and stamp the file name with today's date
csv_columns = ["id", "nuc", "name", "parent", "total_items"]
csv_name = f"trove-contributors-{datetime.datetime.now().strftime('%Y%m%d')}.csv"
df[csv_columns].to_csv(csv_name, index=False)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge).