Trove harvests details of programs and segments broadcast on ABC Radio National. You can find them by searching for nuc:"ABC:RN" in the Music & Audio category. The records include basic metadata such as titles, dates, and contributors, but not full transcripts or audio.
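If you want to check the search directly against the Trove API (version 3, which is what the code below uses), a minimal request looks something like this sketch — you'll need to supply your own API key:
import requests
params = {
    "q": 'nuc:"ABC:RN"',
    "category": "music",  # the Music & Audio category
    "encoding": "json",
    "key": "YOUR_API_KEY",  # replace with your own Trove API key
}
response = requests.get("https://api.trove.nla.gov.au/v3/result", params=params)
print(response.json()["category"][0]["records"]["total"])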
This notebook harvests metadata describing ABC RN programs and segments using the Trove API. Note that there don't seem to have been any additions to the data since early 2022.
As of December 2023, there are 427,141 records (after removing duplicates) from about 163 programs (the actual number of programs is less than this, as the names used for some programs vary). See this notebook for some examples of how you can start exploring the data.
The harvested data is available in this GitHub repository. You can download the full dataset as a 340MB NDJSON file (with a separate JSON object for each record, separated by line breaks) or as a 216MB CSV file (with lists saved as pipe-separated strings).
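If you just want to explore the pre-harvested data, either file loads easily into pandas. A quick sketch — the file names here are placeholders, so use the actual names from the repository:
import pandas as pd
# NDJSON: one JSON object per line, so pass lines=True
df = pd.read_json("abcrn.ndjson", lines=True)  # placeholder file name
# CSV: lists are saved as pipe-separated strings, so split them back into lists
df = pd.read_csv("abcrn.csv")  # placeholder file name
df["contributor"] = df["contributor"].str.split("|")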
For convenience, I've also created separate CSV files for the programs with the most records: RN Breakfast, AM, PM, The World Today, RN Drive, Late Night Live, Life Matters, and The Science Show.
There's also a harvest from 2016 available in this repository.
Any of the fields other than work_id and version_id might be empty, though in most cases there should at least be values for title, date, creator, contributor, and isPartOf.
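For example, here's roughly what a single harvested record looks like, with values abridged from the first row of the dataframe shown further below:
{
    "work_id": "14882967",
    "version_id": "195385238",
    "title": ["RU 486"],
    "date": ["1997-09-22"],
    "isPartOf": ["ABC Radio National. Health Report"],
    "creator": ["Australian Broadcasting Corporation. Radio Na..."],
    "contributor": ["Dr Norman Swan"],
    "abstract": ["What politicians believe is good for women's ..."],
    "type": ["Sound", "Transcript", "Radio Broadcast"],
    "format": ["text/html", "Transcript"],
    "fulltext_url": "http://www.abc.net.au/radionational/programs/h...",
    "thumbnail_url": "http://www.abc.net.au/radionational/image/3699..."
}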
- work_id – identifier for the containing work in Trove (you can use this to create a url to the item, eg https://trove.nla.gov.au/work/14882967)
- version_id – an identifier for the version within the work
- title – title for the program or segment
- isPartOf – name of the program this is a part of
- date – ISO formatted date
- creator – usually just the ABC
- contributor – a list of names of those involved, such as the host, reporter or guest
- type – list of types
- format – list of formats
- abstract – text providing a summary of the program or segment (may include multiple values)
- fulltext_url – link to the page on the ABC website where you can find more information
- thumbnail_url – link to a related thumbnail image on the ABC website
- notonline_url – not sure...
import json
import os
from datetime import datetime
from pathlib import Path
import pandas as pd
import requests_cache
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Create a session that caches responses locally
s = requests_cache.CachedSession()
# Retry requests that fail with server errors
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))
load_dotenv()
True
# Insert your Trove API key
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")
def get_total(params):
    """Get the total number of search results without retrieving any records."""
    params["n"] = 0
    response = s.get("https://api.trove.nla.gov.au/v3/result", params=params)
    data = response.json()
    return int(data["category"][0]["records"]["total"])
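Note that get_total() modifies the params dict it's given (setting n to 0), which is why the harvest() function below passes in a copy. A quick usage sketch:
# Get the number of matching records without retrieving any of them
total = get_total(
    {"q": 'nuc:"ABC:RN"', "category": "music", "encoding": "json", "key": API_KEY}
)
print(total)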
def get_metadata_source(record):
    try:
        source = record["metadataSource"]["value"]
    except TypeError:
        source = record["metadataSource"]
    return source
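The try/except is there because metadataSource seems to arrive in two shapes in the API response — sometimes a dict with a value key, sometimes a plain string. Illustrative inputs (not real API responses):
get_metadata_source({"metadataSource": {"value": "ABC:RN"}})  # returns 'ABC:RN'
get_metadata_source({"metadataSource": "ABC:RN"})  # also returns 'ABC:RN'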
def extract_values(value, key="value"):
    """
    Some fields mix dicts and plain strings. Try to extract values from dicts and always return a list of strings.
    """
    values = []
    # Filter out any empty values
    value_list = [v for v in value if v]
    for v in value_list:
        try:
            values.append(v[key].strip())
        except (TypeError, KeyError):
            values.append(v.strip())
    return values
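So, given a list mixing plain strings and dicts, you get back a flat list of strings. For example (made-up values):
extract_values(["Health Report", {"value": " RU 486 "}])
# ['Health Report', 'RU 486']
# The key parameter handles fields like creator, where the value sits under 'name'
extract_values([{"name": "Australian Broadcasting Corporation"}], key="name")
# ['Australian Broadcasting Corporation']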
def get_links(identifiers):
    """
    Flatten the identifiers list of dicts into a dict with linktype as key.
    """
    links = {}
    for link in identifiers:
        try:
            links[f'{link["linktype"]}_url'] = link["value"]
        except (TypeError, KeyError):
            pass
    return links
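Judging by the columns in the harvested data, the linktype values include fulltext, thumbnail, and notonline. A sketch with made-up values:
get_links([
    {"linktype": "fulltext", "value": "http://www.abc.net.au/radionational/programs/example"},
    {"linktype": "thumbnail", "value": "http://www.abc.net.au/radionational/image/example.jpg"},
])
# {'fulltext_url': 'http://www.abc.net.au/radionational/programs/example',
#  'thumbnail_url': 'http://www.abc.net.au/radionational/image/example.jpg'}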
def harvest(output_file=None, year=None):
    Path("data").mkdir(exist_ok=True)
    if not output_file:
        output_file = f'abcrn-{datetime.now().strftime("%Y%m%d")}.ndjson'
    output_file = Path("data", output_file)
    params = {
        "q": 'nuc:"ABC:RN"',
        "category": "music",
        "include": "workversions",
        "n": 100,
        "bulkHarvest": "true",
        "encoding": "json",
        "key": API_KEY,
    }
    if year:
        params["l-year"] = year
        # The year facet needs the decade facet as well, eg "2022" -> "202"
        params["l-decade"] = year[:3]
    start = "*"
    # Use a copy of params because get_total() modifies the dict it's given
    total = get_total(params.copy())
    with output_file.open("w") as data_file:
        with tqdm(total=total) as pbar:
            while start:
                params["s"] = start
                response = s.get(
                    "https://api.trove.nla.gov.au/v3/result", params=params
                )
                data = response.json()
                # Loop through the work records
                records = data["category"][0]["records"]["work"]
                for record in records:
                    # Now loop through the version records
                    for version in record["version"]:
                        # Sometimes versions can themselves contain multiple records and ids
                        # First we'll split the ids in case there are multiple values
                        ids = version["id"].split()
                        # Then we'll loop through any sub-version records
                        for i, subr in enumerate(version["record"]):
                            # Get the metadata source so we can filter out any records we don't want
                            subv = subr["metadata"]["dc"]
                            source = get_metadata_source(subr)
                            if source == "ABC:RN":
                                # Add work and version ids to the record
                                metadata = {
                                    "work_id": record["id"],
                                    "version_id": ids[i],
                                    "title": extract_values(subv["title"]),
                                    "date": extract_values(subv["date"]),
                                    "isPartOf": extract_values(subv["isPartOf"]),
                                    "creator": extract_values(subv["creator"], key="name"),
                                    "contributor": extract_values(subv["contributor"]),
                                    "abstract": extract_values(subv["abstract"]),
                                    "type": extract_values(subv["type"]),
                                    "format": extract_values(subv["format"]),
                                }
                                # Get links by flattening the identifiers field and add to record
                                links = get_links(subv["identifier"])
                                metadata.update(links)
                                # Write the record to the file as a single line of JSON
                                data_file.write(f"{json.dumps(metadata)}\n")
                # Get the start value for the next page of results (missing on the last page)
                try:
                    start = data["category"][0]["records"]["nextStart"]
                except KeyError:
                    start = None
                pbar.update(len(records))
output_file = f'abcrn-{datetime.now().strftime("%Y%m%d")}.ndjson'
harvest(output_file=output_file)
How many records have we harvested? Let's load the ndjson file into a dataframe and explore.
# The lines param tells pandas there's one JSON object per line.
df = pd.read_json(Path("data", output_file), lines=True)
df.head()
| | work_id | version_id | title | date | isPartOf | creator | contributor | abstract | type | format | fulltext_url | thumbnail_url | notonline_url |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14882967 | 195385238 | [RU 486] | [1997-09-22] | [ABC Radio National. Health Report] | [Australian Broadcasting Corporation. Radio Na... | [Dr Norman Swan] | [What politicians believe is good for women's ... | [Sound, Transcript, Radio Broadcast] | [text/html, Transcript] | http://www.abc.net.au/radionational/programs/h... | http://www.abc.net.au/radionational/image/3699... | NaN |
| 1 | 151422764 | 195400866 | [Copyright and the courts] | [2011-05-12] | [ABC Radio National. Law Report] | [Australian Broadcasting Corporation. Radio Na... | [David, Sabiene Heindl, Jock Given, Ross Steve... | [There's an on-going courtroom war between cop... | [Sound, Transcript, Radio Broadcast] | [Audio, Transcript] | http://www.abc.net.au/radionational/programs/l... | http://www.abc.net.au/radionational/image/3699... | NaN |
| 2 | 15426408 | 206893518 | [The Law Report] | [2014-03-25] | [ABC Radio National. RN Breakfast] | [Australian Broadcasting Corporation. Radio Na... | [Damien Carrick, Fran Kelly] | [Disability rights lawyer and endurance athlet... | [Sound, Transcript, Radio Broadcast] | [text/html] | http://www.abc.net.au/radionational/programs/b... | http://www.abc.net.au/radionational/image/3699... | NaN |
| 3 | 15426408 | 206591783 | [The Law Report] | [2014-02-11] | [ABC Radio National. RN Breakfast] | [Australian Broadcasting Corporation. Radio Na... | [Damien Carrick, Fran Kelly] | [Professor Andrew Ashworth, one of the United ... | [Sound, Transcript, Radio Broadcast] | [text/html] | http://www.abc.net.au/radionational/programs/b... | http://www.abc.net.au/radionational/image/3699... | NaN |
| 4 | 156082218 | 209405411 | [East Timor Since Independence] | [2006-06-29] | [ABC Radio National. Rear Vision] | [Australian Broadcasting Corporation. Radio Na... | [Dr Dennis Shoesmith, Rob Wesley Smith, James ... | [What has happened in East Timor since indepen... | [Text, Transcript, Radio Broadcast] | [Audio] | http://www.abc.net.au/radionational/programs/r... | http://www.abc.net.au/radionational/image/3699... | NaN |
df.shape
(484078, 13)
Most of the fields contain lists of values (though there may only be one value in the list). To check for duplicates we need to convert these lists into strings. If there are multiple values, they'll be separated by a pipe (|) character.
cols_with_lists = [
    "title",
    "isPartOf",
    "date",
    "creator",
    "contributor",
    "type",
    "format",
    "abstract",
]
for col in cols_with_lists:
    df[col] = df[col].str.join("|")
You'd expect the combination of title, date, and program (in the isPartOf field) to be unique – let's see.
df.loc[df.duplicated(subset=("title", "date", "isPartOf"))].shape
(56937, 13)
There seem to be quite a lot of duplicates! Let's remove the duplicates based on the title, date, and isPartOf fields. By adding fulltext_url to the sort, I'm hoping to drop the duplicates without urls (by default drop_duplicates keeps the first version of a duplicated record).
df = df.sort_values(by=["title", "date", "fulltext_url"]).drop_duplicates(
    subset=["title", "date", "isPartOf"]
)
Now how many do we have?
df.shape
(427141, 13)
# Save the complete dataset to a single CSV file
csv_file = Path("data", f'abcrn-{datetime.now().strftime("%Y%m%d")}.csv')
df.to_csv(csv_file, index=False)
Here are the programs with the most records. Note that some, like RN Breakfast, are split between two isPartOf values: 'ABC Radio National. RN Breakfast' and 'ABC Radio. RN Breakfast'.
df["isPartOf"].value_counts()[:20]
ABC Radio National. RN Breakfast        63676
ABC Radio. AM                           56009
ABC Radio. The World Today              51643
ABC Radio. PM                           51231
ABC Radio. RN Breakfast                 19877
ABC Radio National. RN Drive            13785
ABC Radio. RN Drive                     12758
ABC Radio National. Late Night Live     10689
ABC Radio National. Life Matters        10658
ABC Radio. AM Archive                    9825
ABC Radio. PM Archive                    8430
ABC Radio National. The Science Show     8020
ABC Radio. The World Today Archive       7902
ABC Radio National. Saturday Extra       6545
ABC Radio National. Health Report        5041
ABC Radio                                4638
ABC Radio National. Counterpoint         4472
ABC Radio National. Sunday Extra         4246
ABC Radio. Correspondents Report         4005
ABC Radio National. AWAYE!               3456
Name: isPartOf, dtype: int64
Let's save the programs with the most records as separate CSV files to make them a bit easier to work with. We'll also group together programs with multiple isPartOf values.
programs = {
    "breakfast": ["ABC Radio National. RN Breakfast", "ABC Radio. RN Breakfast"],
    "am": ["ABC Radio. AM", "ABC Radio. AM Archive"],
    "pm": ["ABC Radio. PM", "ABC Radio. PM Archive"],
    "world_today": ["ABC Radio. The World Today", "ABC Radio. The World Today Archive"],
    "drive": ["ABC Radio. RN Drive", "ABC Radio National. RN Drive"],
    "latenight": ["ABC Radio National. Late Night Live"],
    "lifematters": ["ABC Radio National. Life Matters"],
    "scienceshow": ["ABC Radio National. The Science Show"],
}
for program, labels in programs.items():
    dfp = df.loc[df["isPartOf"].isin(labels)].sort_values(by=["date", "title"])
    csv_file = Path("data", f'{program}-{datetime.now().strftime("%Y%m%d")}.csv')
    dfp.to_csv(csv_file, index=False)
# This is just a small automated test used in development
# You can ignore this cell
if os.getenv("GW_STATUS") == "dev":
    output_file = f'abcrn-test-{datetime.now().strftime("%Y%m%d")}.ndjson'
    output_path = Path("data", output_file)
    harvest(output_file=output_file, year="2022")
    df = pd.read_json(output_path, lines=True)
    assert df.empty is False
    output_path.unlink()
Created by Tim Sherratt for the GLAM Workbench