This notebook examines what data is available via facets in DigitalNZ.
import requests
import requests_cache
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import FileLinks, display
from pathlib import Path
# Endpoint and credentials used for every DigitalNZ records search.
# NOTE(review): the endpoint is plain HTTP, so the API key travels unencrypted --
# confirm whether an HTTPS endpoint can be used instead.
API_URL = 'http://api.digitalnz.org/v3/records.json'
API_KEY = '[YOUR API KEY]'

# Shared caching session through which all API requests are made.
s = requests_cache.CachedSession()
def get_records(params):
    '''
    Run a search against the records endpoint and return the decoded JSON body.

    `params` is passed straight through as the query parameters of the request.
    '''
    return s.get(API_URL, params=params).json()
def check_facet(facet):
    '''
    Get values for the specified facet, return the total number of values & records,
    and save the complete set of values and counts as a CSV.

    Parameters:
        facet: name of the facet to request from the API.

    Returns:
        A dict with `facet`, `num_values` and `num_records` keys. If the API
        response has no entry for the facet, a message is printed and only the
        `facet` key is present.
    '''
    params = {
        'facets': [facet],
        'api_key': API_KEY,
        'per_page': 0,
        'facets_per_page': 350
    }
    data = get_records(params)
    try:
        facets = data['search']['facets'][facet]
    except KeyError:
        print('Not a facet!')
        return {'facet': facet}
    # A full page of 350 values suggests there are more, so harvest them all
    if len(facets) == 350:
        facets = harvest_facet_values(facet)
    # Build the dataframe from (value, count) pairs so that a facet with no values
    # still yields the two expected columns instead of failing on a column rename
    df = pd.DataFrame(list(facets.items()), columns=['value', 'count'])
    # Make sure the output directory exists before writing into it
    output_dir = Path('facets')
    output_dir.mkdir(parents=True, exist_ok=True)
    # Save all the values and counts as a CSV
    df.to_csv(output_dir / f'{facet}.csv', index=False)
    num_values = df.shape[0]
    num_records = df['count'].sum()
    # Display summary details
    print(f'Number of values: {num_values:,}')
    print(f'Number of records: {num_records:,}')
    # Return summary details
    return {'facet': facet, 'num_values': num_values, 'num_records': num_records}
def harvest_facet_values(facet, **kwargs):
    '''
    Page through the values of the given facet and return them all in one dict.

    Keyword arguments narrow the search: `text` is sent as-is, and any other
    keyword is sent as an `and[<name>][]` parameter. Pages of 350 values are
    requested until a page comes back with no values for the facet.
    '''
    params = {
        'api_key': API_KEY,
        'per_page': 0,
        'facets': facet,
        'facets_per_page': 350,
    }
    for name, value in kwargs.items():
        param_name = name if name == 'text' else f'and[{name}][]'
        params[param_name] = value
    harvested = {}
    page = 1
    with tqdm(leave=False) as pbar:
        while True:
            params['facets_page'] = page
            page_values = get_records(params)['search']['facets'][facet]
            # An empty page means everything has been harvested
            if not page_values:
                break
            harvested.update(page_values)
            pbar.update(350)
            page += 1
    return harvested
The API docs say that the following facets are available via the API: `category`, `display_collection`, `creator`, `placename`, `year`, `decade`, `century`, `language`, `content_partner`, `rights`, and `collection`. However, `display_collection` isn't available. It's also worth noting that the `collection` facet corresponds to the `collection_title` field.
After a bit of poking around, I found that facets are also available for `usage`, `copyright`, `dc_type`, `format`, `subject`, and `primary_collection`.
Let's gather values for each of the available facets.
# Facets listed in the API documentation
documented_facets = [
    'category',
    'display_collection',
    'creator',
    'placename',
    'year',
    'decade',
    'century',
    'language',
    'content_partner',
    'rights',
    'collection',
]
# Facets found by experimenting with the API
discovered_facets = [
    'usage',
    'copyright',
    'dc_type',
    'format',
    'subject',
    'primary_collection',
]
facets = documented_facets + discovered_facets

# Check each facet in turn, keeping the summary returned for each one
facet_data = []
for facet in facets:
    print(f'\n{facet}')
    summary = check_facet(facet)
    facet_data.append(summary)
We now have a dataset that summarises the contents of each facet. If you look in the `facets` directory, you'll also find a CSV file containing all the values and counts for each facet.
Let's look at the summary data.
# Build a summary table with one row per facet
df = pd.DataFrame(facet_data)
# Rows without counts come through as missing values; treat them as zero
# and store both count columns as integers
for column in ['num_values', 'num_records']:
    df[column] = df[column].fillna(0.0).astype('int64')
df
Let's save this dataset as a CSV.
# Write the summary table alongside the per-facet CSVs, without the index column
df.to_csv(Path('facets') / 'facets.csv', index=False)
Let's list all the CSV files we've saved!
# `included_suffixes` takes a list of suffixes. Given a bare string, the suffix
# check becomes a substring test, so extensionless files would be listed too.
display(FileLinks('facets', included_suffixes=['.csv'], recursive=False))
I'm not sure how strict the hierarchies are, but I'm assuming we should be able to connect content partners to collections.
I've used the results of this to visualise open collections in DigitalNZ.
# Load the content partner values from the CSV in the facets directory
partners = pd.read_csv(Path('facets', 'content_partner.csv'))
# Gather one (partner, collection, count) row per primary collection of each partner.
# Plain rows (rather than one dataframe per partner) mean a partner with no
# collections simply contributes nothing, and the names used here don't overwrite
# the `facets` list or the summary `df` created earlier.
collection_rows = []
for partner in partners['value']:
    partner_collections = harvest_facet_values('primary_collection', content_partner=partner)
    collection_rows.extend(
        (partner, collection, count)
        for collection, count in partner_collections.items()
    )
df_collections = pd.DataFrame(
    collection_rows,
    columns=['content_partner', 'primary_collection', 'count']
).sort_values(by=['content_partner', 'primary_collection'])
df_collections.to_csv(Path('facets', 'collections_by_partner.csv'), index=False)
Created by Tim Sherratt for the GLAM Workbench.