Notebook

Harvest SRU API results as JSON¶

You can query the People & Organisations data using the SRU (Search/Retrieve via URL) API. The easiest way to understand how to build SRU queries is to play around with the online interface. More information on the SRU protocol is available from the Library of Congress.

Trove's people and organisation records are available in a number of XML formats, the richest and most complex of which is EAC-CPF. However, the XML records are not easy to work with, so to simplify further processing, this notebook queries the SRU interface and then converts the XML results into JSON.

In [1]:

from pathlib import Path

import requests_cache
from bs4 import BeautifulSoup
from IPython.display import JSON
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

In [2]:

# Available SRU parameters

params = {
    # 'query': 'rec.identifier="http://nla.gov.au/nla.party-641680"', # Can specify a particular property, it not searches all (?) fields
    "query": "",
    "version": "1.1",
    "operation": "searchRetrieve",
    "recordSchema": "urn:isbn:1-931666-33-4",  # This specifies records in EAC-CPF format
    "maximumRecords": 100,
    "startRecord": 1,
    "resultSetTTL": 300,
    "recordPacking": "xml",
    "recordXPath": "",
    "sortKeys": "",
}

# SRU endpoint
api_url = "http://www.nla.gov.au/apps/srw/search/peopleaustralia"

In [3]:

def get_total_results(params):
    params["maximumRecords"] = 0
    response = s.get(api_url, params=params)
    soup = BeautifulSoup(response.content, "xml")
    return int(soup.find("numberOfRecords").string)

In [4]:

def soup_string(elem, prop):
    """
    Saves on memory by not keeping BS navigable string
    """
    if value := elem.find(prop):
        string = str(value.string).strip()
        if string == "None":
            string = value.get_text()
        return string


def get_attr(elem, prop, attr):
    if value := elem.find(prop):
        return value.attrs.get(attr)


def get_date(elem, prop):
    try:
        date = elem.find(prop)["standardDateTime"]
    except (KeyError):
        try:
            date = elem.find(prop)["standardDate"]
        except KeyError:
            date = soup_string(elem, prop)
    except TypeError:
        date = None
    return date


def get_dates(history):
    dates = {}
    if history:
        for event in history.find_all("maintenanceEvent"):
            event_type = soup_string(event, "eventType")
            event_date = get_date(event, "eventDateTime")
            if event_type == "created":
                dates["date_created"] = event_date
            elif event_type == "updated":
                dates["date_modified"] = event_date
    return dates


def get_names(identity):
    names = []
    for name_entry in identity.find_all("nameEntry"):
        name = {}
        for part in name_entry.find_all("part"):
            if part.has_attr("localType"):
                name_type = part["localType"]
            else:
                name_type = "name"
            try:
                name[name_type].append(str(part.string))
            except (KeyError, AttributeError):
                name[name_type] = [str(part.string)]
        if name_entry.find("authorizedForm"):
            name["authorized"] = True
        else:
            name["authorized"] = False
        names.append(name)
    return names


def get_exist_dates(description):
    exist_dates = {}
    dates = description.find("existDates")
    if dates:
        exist_dates["date_from"] = get_date(dates, "fromDate")
        exist_dates["date_to"] = get_date(dates, "toDate")
    return exist_dates


def get_places(description):
    places = []
    places_elem = description.find("places")
    if places_elem:
        for place_entry in places_elem.find_all("place"):
            place = {
                "place_type": soup_string(place_entry, "placeRole"),
                "name": soup_string(place_entry, "placeEntry"),
                "date_from": get_date(place_entry, "fromDate"),
                "date_to": get_date(place_entry, "toDate"),
            }
            places.append(place)
    return places


def get_events(description):
    events = []
    for event_list in description.find_all("chronList"):
        for event in event_list.find_all("chronItem"):
            events.append(
                {
                    "name": soup_string(event, "event"),
                    "date": get_date(event, "date"),
                    "date_from": get_date(event, "fromDate"),
                    "date_to": get_date(event, "toDate"),
                }
            )
    return events


def get_occupations(description):
    occupations = []
    if occupation_list := description.find("occupations"):
        for occupation in occupation_list.find_all("occupation"):
            occupations.append(soup_string(occupation, "term"))
    return occupations


def get_related_entities(eac):
    related = []
    for relation in eac.find_all("cpfRelation"):
        # Can be resourceRelation or cpfRelation
        if description := relation.find("descriptiveNote"):
            description = description.get_text().strip()
        else:
            description = None
        related.append(
            {
                "relation_type": relation.attrs.get("cpfRelationType"),
                "href": relation.attrs.get("href"),
                "name": soup_string(relation, "relationEntry"),
                "entity_type": get_attr(relation, "relationEntry", "localType"),
                "date_from": get_date(relation, "fromDate"),
                "date_to": get_date(relation, "toDate"),
                "description": description,
            }
        )
    return related


def get_related_resources(eac):
    related = []
    for relation in eac.find_all("resourceRelation"):
        # Can be resourceRelation or cpfRelation
        relation_type = relation.attrs.get("resourceRelationType")
        if relation.find("dc"):
            if description := relation.find_all("description"):
                description = " ".join([d.get_text() for d in description])
            related.append(
                {
                    "relation_type": relation_type,
                    "href": soup_string(relation, "identifier"),
                    "name": soup_string(relation, "title"),
                    "resource_type": None,
                    "contributor": soup_string(relation, "contributor"),
                    "date": soup_string(relation, "date"),
                    "description": description,
                }
            )
        else:
            if description := relation.find("abstract"):
                description = description.get_text()
            related.append(
                {
                    "relation_type": relation_type,
                    "href": relation.attrs.get("href"),
                    "name": soup_string(relation, "relationEntry"),
                    "resource_type": get_attr(relation, "relationEntry", "localType"),
                    "contributor": soup_string(relation, "name"),
                    "date": soup_string(relation, "date"),
                    "description": description,
                }
            )
    return related


def get_biog(description):
    biog = []
    for bio in description.find_all("biogHist"):
        for para in bio.find_all("p"):
            biog.append(str(para.string).strip())
    return " ".join(biog)


def get_sources(eac):
    sources = []
    for source_eac in eac.find_all("eac-cpf"):
        source = process_eac(source_eac)
        source["related_entities"] = get_related_entities(source_eac)
        source["related_resources"] = get_related_resources(source_eac)
        sources.append(source)
    return sources


def get_agency_details(agency_element):
    agency = {
        "agency_id": soup_string(agency_element, "agencyCode"),
        "agency_name": soup_string(agency_element, "agencyName"),
    }
    return agency


def get_eac_meta(eac):
    meta = {"record_id": soup_string(eac, "recordId")}
    control = eac.find("control")
    # agency
    meta.update(get_agency_details(control.find("maintenanceAgency")))
    meta.update(get_dates(control.find("maintenanceHistory")))
    return meta


def format_name(names, entity_type):
    authorized = None
    combined_names = []
    for name in names:
        if name["authorized"] is True:
            authorized = name
            break
    if not authorized:
        try:
            authorized = names[0]
        except IndexError:
            pass
    if authorized:
        for name_type in ["forename", "surname", "name", "parent"]:
            combined_names += authorized.get(name_type, [])
    return " ".join(combined_names)


def process_eac(eac):
    record = get_eac_meta(eac)
    identity = eac.find("identity")
    record["names"] = get_names(identity)
    record["entity_type"] = soup_string(identity, "entityType")
    record["entity_id"] = soup_string(identity, "entityId")
    record["name"] = format_name(record["names"], record["entity_type"])
    description = eac.find("description")
    if not description:
        description = eac.find("cpfDescription")
    record["dates"] = get_exist_dates(description)
    record["places"] = get_places(description)
    record["occupations"] = get_occupations(description)
    record["abstract"] = soup_string(description, "abstract")
    record["description"] = get_biog(description)
    record["events"] = get_events(description)
    record["sources"] = get_sources(eac)
    return record


def get_records(params):
    records = []
    response = s.get(api_url, params=params)
    soup = BeautifulSoup(response.content, "xml")
    for result in soup.find_all("record"):
        eac = result.find("eac-cpf")
        # get id info here
        record = process_eac(eac)
        record["trove_url"] = f"https://nla.gov.au/nla.party-{record['record_id']}"
        records.append(record)
    return records


def harvest_results(params):
    records = []
    total = get_total_results(params.copy())
    start = 1
    with tqdm(total=total) as pbar:
        while start <= total:
            params["start"] = start
            new_records = get_records(params)
            records += new_records
            start += 100
            pbar.update(len(new_records))
    return records

In [ ]:

search_params = params.copy()
search_params["query"] = "wragge"
results = harvest_results(search_params)

In [ ]:

JSON(results)

Some testing¶

In [85]:

# Test the processing code across the harvested data set
with Path("peau-data.xml").open("r") as xml_file:
    for i, xml in enumerate(xml_file):
        # if i < 100000:
        soup = BeautifulSoup(xml, "xml")
        eac = soup.find("eac-cpf")
        try:
            process_eac(eac)
        except AttributeError:
            print(soup.prettify())
            raise
        soup.decompose()

Created by Tim Sherratt for the GLAM Workbench.

The development of this notebook was supported by the Australian Cultural Data Engine.