This notebook examines what data is available via facets in DigitalNZ.
import requests
import requests_cache
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import FileLinks, display
from pathlib import Path
# Session from requests_cache, used for every API request made in this notebook
s = requests_cache.CachedSession()
# Placeholder -- replace with your own DigitalNZ API key before running
API_KEY = '[YOUR API KEY]'
# Records search endpoint that all requests are sent to
# NOTE(review): this is a plain http URL and the API key is sent as a query
# parameter -- confirm whether the https endpoint can be used instead
API_URL = 'http://api.digitalnz.org/v3/records.json'
def get_records(params):
    '''
    Send a search request to the API and return the decoded JSON response.

    Parameters:
        params: dictionary of query-string parameters to send with the request.

    Returns the body of the response parsed from JSON.
    '''
    return s.get(API_URL, params=params).json()
def check_facet(facet):
    '''
    Get values for the specified facet, print the total number of values & records,
    and save the complete set of values and counts as a CSV in the 'facets' directory.

    Parameters:
        facet: name of the facet to request from the API.

    Returns a dictionary of summary details. If the response has no data for the
    facet it only contains the 'facet' key; otherwise it also contains
    'num_values' (the number of facet values) and 'num_records' (the sum of the
    counts across all values).
    '''
    params = {
        'facets': [facet],
        'api_key': API_KEY,
        'per_page': 0,
        'facets_per_page': 350
    }
    data = get_records(params)
    try:
        facets = data['search']['facets'][facet]
    except KeyError:
        print('Not a facet!')
        return {'facet': facet}
    # A full first page (350 values) means there may be more, so harvest them all
    if len(facets) == 350:
        facets = harvest_facet_values(facet)
    # Build the dataframe from (value, count) pairs so that a facet with no
    # values gives an empty two-column frame rather than a column mismatch error
    df = pd.DataFrame(list(facets.items()), columns=['value', 'count'])
    # Make sure the output directory exists before trying to write to it
    output_dir = Path('facets')
    output_dir.mkdir(parents=True, exist_ok=True)
    # Save all the values and counts as a CSV
    df.to_csv(Path(output_dir, f'{facet}.csv'), index=False)
    num_values = df.shape[0]
    num_records = df['count'].sum()
    # Display summary details
    print(f'Number of values: {num_values:,}')
    print(f'Number of records: {num_records:,}')
    # Return summary details
    return {'facet': facet, 'num_values': num_values, 'num_records': num_records}
def harvest_facet_values(facet, **kwargs):
    '''
    Harvest all the available values for the given facet.

    Parameters:
        facet: name of the facet to harvest.
        **kwargs: optional search filters. A 'text' keyword is sent as the
            'text' parameter unchanged; any other keyword k is sent as the
            'and[k][]' parameter.

    Returns a dictionary combining the facet values from every page of results.
    '''
    facets = {}
    params = {
        'api_key': API_KEY,
        'per_page': 0,
        'facets': facet,
        'facets_per_page': 350,
    }
    for k, v in kwargs.items():
        if k == 'text':
            params[k] = v
        else:
            params[f'and[{k}][]'] = v
    page = 1
    with tqdm(leave=False) as pbar:
        while True:
            params['facets_page'] = page
            data = get_records(params)
            page_values = data['search']['facets'][facet]
            # An empty page means every value has been harvested
            if not page_values:
                break
            facets.update(page_values)
            # Advance by the number of values actually received, so a partial
            # final page doesn't inflate the progress count
            pbar.update(len(page_values))
            page += 1
    return facets
The API docs say that the following facets are available via the API: `category`, `display_collection`, `creator`, `placename`, `year`, `decade`, `century`, `language`, `content_partner`, `rights`, and `collection`. However, `display_collection` isn't available. It's also worth noting that the `collection` facet corresponds to the `collection_title` field.
After a bit of poking around, I found that facets are also available for `usage`, `copyright`, `dc_type`, `format`, `subject`, and `primary_collection`.
Let's gather values for each of the available facets.
# Summary details for each facet are collected here, one dictionary per facet
facet_data = []
facets = [
    # Facets listed in the API documentation
    'category', 'display_collection', 'creator', 'placename',
    'year', 'decade', 'century', 'language',
    'content_partner', 'rights', 'collection',
    # Additional facets found by experimenting with the API
    'usage', 'copyright', 'dc_type', 'format',
    'subject', 'primary_collection',
]
# Check each facet in turn, printing its name as a heading above the results
for facet in facets:
    print(f'\n{facet}')
    facet_data.append(check_facet(facet))
category Number of values: 19 Number of records: 32,126,494 display_collection Not a facet! creator
0it [00:00, ?it/s]
Number of values: 333,529 Number of records: 3,554,778 placename
0it [00:00, ?it/s]
Number of values: 216,151 Number of records: 26,365,644 year
0it [00:00, ?it/s]
Number of values: 974 Number of records: 31,241,963 decade Number of values: 279 Number of records: 30,953,700 century Number of values: 78 Number of records: 30,866,245 language Number of values: 235 Number of records: 24,649,319 content_partner Number of values: 215 Number of records: 32,114,054 rights
0it [00:00, ?it/s]
Number of values: 45,413 Number of records: 29,978,417 collection
0it [00:00, ?it/s]
Number of values: 25,015 Number of records: 60,325,343 usage Number of values: 5 Number of records: 81,707,236 copyright Number of values: 33 Number of records: 31,990,162 dc_type
0it [00:00, ?it/s]
Number of values: 3,237 Number of records: 2,099,860 format
0it [00:00, ?it/s]
Number of values: 80,228 Number of records: 2,770,227 subject
0it [00:00, ?it/s]
Number of values: 1,019,286 Number of records: 13,659,843 primary_collection Number of values: 315 Number of records: 32,113,360
We now have a dataset that summarises the contents of each facet. If you look in the `facets` directory, you'll also find a CSV file containing all the values and counts for each facet.
Let's look at the summary data.
# Names of the summary columns that hold counts
count_fills = {'num_values': 0.0, 'num_records': 0.0}
count_types = {'num_values': 'int64', 'num_records': 'int64'}
# Build the summary dataframe, replacing missing counts (facets that returned
# no data) with zero so that both count columns can be stored as integers
df = pd.DataFrame(facet_data).fillna(count_fills).astype(count_types)
df
facet | num_values | num_records | |
---|---|---|---|
0 | category | 19 | 32126494 |
1 | display_collection | 0 | 0 |
2 | creator | 333529 | 3554778 |
3 | placename | 216151 | 26365644 |
4 | year | 974 | 31241963 |
5 | decade | 279 | 30953700 |
6 | century | 78 | 30866245 |
7 | language | 235 | 24649319 |
8 | content_partner | 215 | 32114054 |
9 | rights | 45413 | 29978417 |
10 | collection | 25015 | 60325343 |
11 | usage | 5 | 81707236 |
12 | copyright | 33 | 31990162 |
13 | dc_type | 3237 | 2099860 |
14 | format | 80228 | 2770227 |
15 | subject | 1019286 | 13659843 |
16 | primary_collection | 315 | 32113360 |
Let's save this dataset as a CSV.
# Write the facet summary to facets/facets.csv, leaving out the dataframe index
df.to_csv(Path('facets') / 'facets.csv', index=False)
Let's list all the CSV files we've saved!
# Show download links for the CSV files saved in the facets directory.
# included_suffixes is passed as a list: with a bare string the suffix check
# becomes a substring test, which also matches files that have no extension.
display(FileLinks('facets', included_suffixes=['.csv'], recursive=False))
I'm not sure how strict the hierarchies are, but I'm assuming we should be able to connect content partners to collections.
I've used the results of this to visualise open collections in DigitalNZ.
# Load the content partner values saved earlier as facets/content_partner.csv
partners = pd.read_csv(Path('facets', 'content_partner.csv'))
dfs = []
for row in partners.itertuples():
    partner = row.value
    # Harvest the primary_collection facet values, filtered to this content partner
    facets = harvest_facet_values('primary_collection', content_partner=partner)
    # Skip partners with no collections: an empty result produces a frame with
    # a single column, so assigning the two column names below would fail
    if not facets:
        continue
    df = pd.DataFrame.from_dict(facets, orient='index').reset_index()
    df.columns = ['primary_collection', 'count']
    # Record which partner these collections belong to
    df['content_partner'] = partner
    dfs.append(df)
# Combine the per-partner results into one dataframe, ordered by partner then collection
df_collections = pd.concat(dfs)
df_collections = df_collections[['content_partner', 'primary_collection', 'count']].sort_values(by=['content_partner', 'primary_collection'])
df_collections.to_csv(Path('facets', 'collections_by_partner.csv'), index=False)
Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.