This notebook harvests all the public tags that users have added to records in Trove. However, tags are being added all the time, so by the time you've finished harvesting, the dataset will probably be out of date.
You can access tags via the API by adding `has:tags` to the `q` (query) parameter to limit results to records with tags, and then adding the `include=tags` parameter to include the tag data in each item record.
The `harvest_tags()` function harvests all tags from the specified zone and writes them to a CSV file named according to the zone — for example, `tags_newspaper.csv`.
Each CSV file contains the following columns:

- `tag` – the text of the tag
- `date` – the date the tag was added
- `zone` – the Trove API zone (eg 'newspaper', 'book')
- `record_id` – the id of the record to which the tag has been added

Once the zone harvests are complete you can use this notebook to combine the separate CSV files, normalise the capitalisation of tags, and save the complete results into a single CSV file.
Some things to note:

- In some zones, tags can be attached at either the work or version level. To simplify things, this code aggregates all tags at the work level, removing any duplicates.
- The complete dataset created by this notebook in July 2021 is available for download from CloudStor and Zenodo.
For some examples of how you might analyse and visualise the harvested tags, see this notebook.
import csv
import os
import time
from pathlib import Path
import pandas as pd
import requests_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Shared cached session with automatic retry/backoff on transient server errors.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s = requests_cache.CachedSession()
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))
%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv
# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
# (an env var takes precedence over the placeholder above)
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")
# Use HTTPS so the API key isn't sent in cleartext with every request
api_url = "https://api.trove.nla.gov.au/v2/result"
# Set basic parameters
params = {
    "q": "has:tags",  # limit results to records that have at least one tag
    "include": "tags",  # embed the tag data in each item record
    "encoding": "json",
    "bulkHarvest": "true",  # stable, id-ordered paging for complete harvests
    "n": 100,  # number of records per request
    "key": API_KEY,
}
# These types are needed to get data from API results --
# the key under which each zone's records appear in the response
record_types = {
    "newspaper": "article",
    "gazette": "article",
    "book": "work",
    "article": "work",
    "picture": "work",
    "music": "work",
    "map": "work",
    "collection": "work",
    "list": "list",
}
def get_total(cparams):
    """
    Return the total number of results for the supplied query parameters.

    This will enable us to make a nice progress bar...

    Parameters:
        cparams (dict): Trove API query parameters (must include a 'zone').

    Returns:
        int: total record count reported by the first zone in the response.

    Raises:
        requests.HTTPError: if the API returns an error status.
    """
    response = s.get(api_url, params=cparams)
    # Fail fast on HTTP errors rather than surfacing an opaque JSON decode error
    response.raise_for_status()
    data = response.json()
    return int(data["response"]["zone"][0]["records"]["total"])
def get_tags_from_record(record):
    """
    Extract tags from the supplied record.

    Parameters:
        record (dict): an item record (or version record) from the Trove API.

    Returns:
        list: a list of tags; each tag is a two-element list of
        [value, lastupdated]. Empty if the record has no 'tag' key.
    """
    # Records without tags simply lack the 'tag' key, so default to an empty
    # list instead of wrapping the whole loop in a KeyError handler.
    return [
        [tag.get("value"), tag.get("lastupdated")]
        for tag in record.get("tag", [])
    ]
def harvest_tags(zone):
    """
    Harvest public tags from the specified zone.

    Pages through the complete result set using the bulkHarvest start token,
    appending one row per tag to `tags_{zone}.csv` as it goes. Each row has
    the columns: tag, date, zone, record_id.

    Parameters:
        zone (str): a key of `record_types`, eg 'newspaper' or 'book'.
    """
    print(zone)
    # article, work, or list -- the key under which records appear in results
    record_type = record_types[zone]
    # Delete existing data file so each run starts a fresh harvest
    Path(f"tags_{zone}.csv").unlink(missing_ok=True)
    # Write column headings
    with Path(f"tags_{zone}.csv").open("a") as tag_file:
        writer = csv.writer(tag_file)
        writer.writerow(["tag", "date", "zone", "record_id"])
    # '*' requests the first page of a bulk harvest
    start = "*"
    cparams = params.copy()
    cparams["zone"] = zone
    # If it's a work, get versions as well (so version-level tags are included)
    if record_type == "work":
        cparams["include"] = "tags,workversions"
    total = get_total(cparams)
    with tqdm(total=total) as pbar:
        while start is not None:
            cparams["s"] = start
            response = s.get(api_url, params=cparams)
            data = response.json()
            results = data["response"]["zone"][0]["records"]
            # Get token for next page
            try:
                start = results["nextStart"]
            # End of the result set -- no nextStart means this is the last page
            except KeyError:
                start = None
            # Re-open the CSV in append mode per page so progress is flushed
            # to disk as the (long-running) harvest proceeds
            with Path(f"tags_{zone}.csv").open("a") as tag_file:
                writer = csv.writer(tag_file)
                for record in results[record_type]:
                    tags = []
                    tags += get_tags_from_record(record)
                    # If there are versions loop through them gathering tags
                    if "version" in record:
                        for version in record["version"]:
                            tags += get_tags_from_record(version)
                    # Remove duplicate tags on work -- identical [value, date]
                    # pairs collapse to one via the set of tuples
                    tags = [list(t) for t in {tuple(tl) for tl in tags}]
                    # NOTE(review): looks like leftover debugging output --
                    # flags any record with no tags despite the has:tags query
                    if len(tags) == 0:
                        print(record)
                    # Add zone and record_id, then write to CSV
                    for tag in tags:
                        tag.append(zone)
                        tag.append(record["id"])
                        writer.writerow(tag)
            pbar.update(len(results[record_type]))
            # Be polite to the API: only pause after live (uncached) requests
            if not response.from_cache:
                time.sleep(0.2)
# Run the harvest for every zone in turn
zones = [
    "newspaper",
    "gazette",
    "book",
    "article",
    "picture",
    "music",
    "map",
    "collection",
    "list",
]
for zone in zones:
    harvest_tags(zone)
# Combine the per-zone CSV files into a single dataframe
zones = [
    "newspaper",
    "gazette",
    "book",
    "article",
    "picture",
    "music",
    "map",
    "collection",
    "list",
]
df = pd.concat(pd.read_csv(f"tags_{zone}.csv") for zone in zones)
df.head()
tag | date | zone | record_id | |
---|---|---|---|---|
0 | Stephen Guihen | 2013-03-24T02:30:11Z | newspaper | 100000011 |
1 | HICKEN Aberaham - Barellan | 2019-12-03T23:02:10Z | newspaper | 100000071 |
2 | WAIDE | 2014-04-02T06:39:06Z | newspaper | 100000098 |
3 | Pannowitz-Rhode wedding 1912 | 2013-02-28T00:47:10Z | newspaper | 100000101 |
4 | WAIDE | 2014-04-02T06:39:32Z | newspaper | 100000106 |
How many tagged items are there? (Note that this could include duplicates where items are available in multiple zones.)
df.shape
(9428937, 4)
How many unique tags are there?
df["tag"].nunique()
2201090
Cases are mixed in tags, although tag search in Trove is case-insensitive. Here we'll convert all the tags to lower-case, so we can aggregate them.
# Tag search in Trove is case-insensitive, so lower-case every tag to let
# variant capitalisations aggregate together
df["tag_normalised"] = df["tag"].str.lower()
To keep things compact, we'll drop the mixed-case tags and rename the new column.
# Drop the mixed-case tag column and promote the lower-cased
# 'tag_normalised' column to the original 'tag' name
df = df.drop(columns="tag").rename(columns={"tag_normalised": "tag"})
Now let's save the complete, normalised dataset to a single CSV file.
# Save the normalised dataset as a single CSV, with the columns
# restored to their original order
df.to_csv(
    "trove_tags_20220706.csv",
    index=False,
    columns=["tag", "date", "zone", "record_id"],
)
Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.