You can query the People & Organisations data using the SRU (Search/Retrieve via URL) API. The easiest way to understand how to build SRU queries is to play around with the online interface. More information on the SRU protocol is available from the Library of Congress.
Trove's people and organisation records are available in a number of XML formats, the richest and most complex of which is EAC-CPF. However, the XML records are not easy to work with, so to simplify further processing, this notebook queries the SRU interface and then converts the XML results into JSON.
from pathlib import Path
import requests_cache
from bs4 import BeautifulSoup
from IPython.display import JSON
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
# Available SRU parameters
params = {
# 'query': 'rec.identifier="http://nla.gov.au/nla.party-641680"', # Can specify a particular property, it not searches all (?) fields
"query": "",
"version": "1.1",
"operation": "searchRetrieve",
"recordSchema": "urn:isbn:1-931666-33-4", # This specifies records in EAC-CPF format
"maximumRecords": 100,
"startRecord": 1,
"resultSetTTL": 300,
"recordPacking": "xml",
"recordXPath": "",
"sortKeys": "",
}
# SRU endpoint
api_url = "http://www.nla.gov.au/apps/srw/search/peopleaustralia"
def get_total_results(params):
params["maximumRecords"] = 0
response = s.get(api_url, params=params)
soup = BeautifulSoup(response.content, "xml")
return int(soup.find("numberOfRecords").string)
def soup_string(elem, prop):
"""
Saves on memory by not keeping BS navigable string
"""
if value := elem.find(prop):
string = str(value.string).strip()
if string == "None":
string = value.get_text()
return string
def get_attr(elem, prop, attr):
if value := elem.find(prop):
return value.attrs.get(attr)
def get_date(elem, prop):
try:
date = elem.find(prop)["standardDateTime"]
except (KeyError):
try:
date = elem.find(prop)["standardDate"]
except KeyError:
date = soup_string(elem, prop)
except TypeError:
date = None
return date
def get_dates(history):
dates = {}
if history:
for event in history.find_all("maintenanceEvent"):
event_type = soup_string(event, "eventType")
event_date = get_date(event, "eventDateTime")
if event_type == "created":
dates["date_created"] = event_date
elif event_type == "updated":
dates["date_modified"] = event_date
return dates
def get_names(identity):
names = []
for name_entry in identity.find_all("nameEntry"):
name = {}
for part in name_entry.find_all("part"):
if part.has_attr("localType"):
name_type = part["localType"]
else:
name_type = "name"
try:
name[name_type].append(str(part.string))
except (KeyError, AttributeError):
name[name_type] = [str(part.string)]
if name_entry.find("authorizedForm"):
name["authorized"] = True
else:
name["authorized"] = False
names.append(name)
return names
def get_exist_dates(description):
exist_dates = {}
dates = description.find("existDates")
if dates:
exist_dates["date_from"] = get_date(dates, "fromDate")
exist_dates["date_to"] = get_date(dates, "toDate")
return exist_dates
def get_places(description):
places = []
places_elem = description.find("places")
if places_elem:
for place_entry in places_elem.find_all("place"):
place = {
"place_type": soup_string(place_entry, "placeRole"),
"name": soup_string(place_entry, "placeEntry"),
"date_from": get_date(place_entry, "fromDate"),
"date_to": get_date(place_entry, "toDate"),
}
places.append(place)
return places
def get_events(description):
events = []
for event_list in description.find_all("chronList"):
for event in event_list.find_all("chronItem"):
events.append(
{
"name": soup_string(event, "event"),
"date": get_date(event, "date"),
"date_from": get_date(event, "fromDate"),
"date_to": get_date(event, "toDate"),
}
)
return events
def get_occupations(description):
occupations = []
if occupation_list := description.find("occupations"):
for occupation in occupation_list.find_all("occupation"):
occupations.append(soup_string(occupation, "term"))
return occupations
def get_related_entities(eac):
related = []
for relation in eac.find_all("cpfRelation"):
# Can be resourceRelation or cpfRelation
if description := relation.find("descriptiveNote"):
description = description.get_text().strip()
else:
description = None
related.append(
{
"relation_type": relation.attrs.get("cpfRelationType"),
"href": relation.attrs.get("href"),
"name": soup_string(relation, "relationEntry"),
"entity_type": get_attr(relation, "relationEntry", "localType"),
"date_from": get_date(relation, "fromDate"),
"date_to": get_date(relation, "toDate"),
"description": description,
}
)
return related
def get_related_resources(eac):
related = []
for relation in eac.find_all("resourceRelation"):
# Can be resourceRelation or cpfRelation
relation_type = relation.attrs.get("resourceRelationType")
if relation.find("dc"):
if description := relation.find_all("description"):
description = " ".join([d.get_text() for d in description])
related.append(
{
"relation_type": relation_type,
"href": soup_string(relation, "identifier"),
"name": soup_string(relation, "title"),
"resource_type": None,
"contributor": soup_string(relation, "contributor"),
"date": soup_string(relation, "date"),
"description": description,
}
)
else:
if description := relation.find("abstract"):
description = description.get_text()
related.append(
{
"relation_type": relation_type,
"href": relation.attrs.get("href"),
"name": soup_string(relation, "relationEntry"),
"resource_type": get_attr(relation, "relationEntry", "localType"),
"contributor": soup_string(relation, "name"),
"date": soup_string(relation, "date"),
"description": description,
}
)
return related
def get_biog(description):
biog = []
for bio in description.find_all("biogHist"):
for para in bio.find_all("p"):
biog.append(str(para.string).strip())
return " ".join(biog)
def get_sources(eac):
sources = []
for source_eac in eac.find_all("eac-cpf"):
source = process_eac(source_eac)
source["related_entities"] = get_related_entities(source_eac)
source["related_resources"] = get_related_resources(source_eac)
sources.append(source)
return sources
def get_agency_details(agency_element):
agency = {
"agency_id": soup_string(agency_element, "agencyCode"),
"agency_name": soup_string(agency_element, "agencyName"),
}
return agency
def get_eac_meta(eac):
meta = {"record_id": soup_string(eac, "recordId")}
control = eac.find("control")
# agency
meta.update(get_agency_details(control.find("maintenanceAgency")))
meta.update(get_dates(control.find("maintenanceHistory")))
return meta
def format_name(names, entity_type):
authorized = None
combined_names = []
for name in names:
if name["authorized"] is True:
authorized = name
break
if not authorized:
try:
authorized = names[0]
except IndexError:
pass
if authorized:
for name_type in ["forename", "surname", "name", "parent"]:
combined_names += authorized.get(name_type, [])
return " ".join(combined_names)
def process_eac(eac):
record = get_eac_meta(eac)
identity = eac.find("identity")
record["names"] = get_names(identity)
record["entity_type"] = soup_string(identity, "entityType")
record["entity_id"] = soup_string(identity, "entityId")
record["name"] = format_name(record["names"], record["entity_type"])
description = eac.find("description")
if not description:
description = eac.find("cpfDescription")
record["dates"] = get_exist_dates(description)
record["places"] = get_places(description)
record["occupations"] = get_occupations(description)
record["abstract"] = soup_string(description, "abstract")
record["description"] = get_biog(description)
record["events"] = get_events(description)
record["sources"] = get_sources(eac)
return record
def get_records(params):
records = []
response = s.get(api_url, params=params)
soup = BeautifulSoup(response.content, "xml")
for result in soup.find_all("record"):
eac = result.find("eac-cpf")
# get id info here
record = process_eac(eac)
record["trove_url"] = f"https://nla.gov.au/nla.party-{record['record_id']}"
records.append(record)
return records
def harvest_results(params):
records = []
total = get_total_results(params.copy())
start = 1
with tqdm(total=total) as pbar:
while start <= total:
params["start"] = start
new_records = get_records(params)
records += new_records
start += 100
pbar.update(len(new_records))
return records
search_params = params.copy()
search_params["query"] = "wragge"
results = harvest_results(search_params)
JSON(results)
# Test the processing code across the harvested data set
with Path("peau-data.xml").open("r") as xml_file:
for i, xml in enumerate(xml_file):
# if i < 100000:
soup = BeautifulSoup(xml, "xml")
eac = soup.find("eac-cpf")
try:
process_eac(eac)
except AttributeError:
print(soup.prettify())
raise
soup.decompose()
Created by Tim Sherratt for the GLAM Workbench.
The development of this notebook was supported by the Australian Cultural Data Engine.