This notebook pokes around at the top level of DigitalNZ, mainly using facets.
See the API documentation for more detailed information.
If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!
Some tips:
import requests
import pandas as pd
import altair as alt
from IPython.display import display, HTML
Get yourself an API key and paste it between the quotes below.
# Placeholder value: replace the text between the quotes with your own key.
api_key = '[YOUR API KEY]'
# Echo the key back so you can check that the cell picked up your edit.
print('Your API key is:', api_key)
# Base url for queries.
# HTTPS is used because the API key is sent as a query-string parameter and
# should not travel over the network in clear text.
api_search_url = 'https://api.digitalnz.org/v3/records.json'
# Set up the query params (we'll change these later)
# Let's start with an empty text query to look at everything
def set_params():
    '''
    Build a fresh dictionary of query parameters.

    The result holds the module-level API key and an empty text query.
    A new dict is created on every call, so callers can add or change
    keys without affecting earlier results.
    '''
    return {'api_key': api_key, 'text': ''}
def get_data(params):
    '''
    Retrieve an API query and extract the JSON payload.

    Parameters:
        params: dict of query-string parameters sent to api_search_url.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the API answers with a 4xx or 5xx status.
    '''
    response = requests.get(api_search_url, params=params)
    # Fail here, with the HTTP status in the message, rather than letting an
    # error response surface later as a confusing KeyError on the payload.
    response.raise_for_status()
    return response.json()
# How many items are there?
# Run the default (empty text) query and read the total from the response.
params = set_params()
data = get_data(params)
total_items = data['search']['result_count']
print(' There are {:,} items'.format(total_items))
There are 32,111,791 items
# Ask the API to facet the current query by century.
params['facets'] = 'century'
data = get_data(params)
centuries = data['search']['facets']['century']
# Convert the facet data into a two-column table: the Series index becomes
# the 'century' column and the Series values become the 'count' column.
centuries_df = (
    pd.Series(centuries)
    .rename_axis('century')
    .reset_index(name='count')
)
centuries_df
century | count | |
---|---|---|
0 | 1900 | 17209636 |
1 | 1800 | 11159985 |
2 | 2000 | 2482630 |
3 | 1700 | 6087 |
4 | 1600 | 2782 |
5 | 1400 | 1109 |
6 | 1300 | 1014 |
7 | 1500 | 606 |
8 | 600 | 542 |
9 | 700 | 388 |
# Both charts share the same data, x axis and tooltip; only the y scale differs.
count_tooltip = alt.Tooltip('count', format=',')
century_bars = alt.Chart(centuries_df).mark_bar().encode(
    x='century:O',
    tooltip=count_tooltip,
)
# Left: counts on the default y scale.
c1 = century_bars.encode(y='count:Q')
# Right: the same counts on a logarithmic y scale.
c2 = century_bars.encode(
    y=alt.Y('count:Q', scale=alt.Scale(type='log')),
)
# Place the two charts side by side.
c1 | c2
# Facet the current query by decade, asking for 25 facet values per page.
params.update({'facets': 'decade', 'facets_per_page': 25})
data = get_data(params)
decades = data['search']['facets']['decade']
# Index -> 'decade' column, values -> 'count' column.
decades_df = (
    pd.Series(decades)
    .rename_axis('decade')
    .reset_index(name='count')
)
decades_df.head()
decade | count | |
---|---|---|
0 | 1900 | 6464371 |
1 | 1910 | 6178640 |
2 | 1890 | 4758678 |
3 | 1880 | 3663331 |
4 | 1870 | 1844200 |
# Bar chart of the decade counts; the tooltip formats the count with
# thousands separators.
decade_bars = alt.Chart(decades_df).mark_bar()
decade_bars.encode(
    x=alt.X('decade:O'),
    y=alt.Y('count:Q'),
    tooltip=alt.Tooltip('count', format=','),
)
# Facet the current query by display collection, 26 facet values per page.
params.update({'facets': 'display_collection', 'facets_per_page': 26})
data = get_data(params)
# Note that the facet is called 'primary_collection' in the results!
collections = data['search']['facets']['primary_collection']
# Index -> 'collection' column, values -> 'count' column.
collections_df = (
    pd.Series(collections)
    .rename_axis('collection')
    .reset_index(name='count')
)
collections_df.head()
collection | count | |
---|---|---|
0 | Papers Past | 26122911 |
1 | Radio New Zealand | 778363 |
2 | iNaturalist NZ — Mātaki Taiao | 571510 |
3 | TAPUHI | 338051 |
4 | Auckland Libraries Heritage Images Collection | 267112 |
Papers Past is so much bigger than anything else, let's exclude it from the chart.
# Leave Papers Past out of the chart. It is selected by name rather than by
# row position, so the right row is dropped even if the API returns the
# facets in a different order.
other_collections_df = collections_df[collections_df['collection'] != 'Papers Past']
alt.Chart(other_collections_df).mark_bar().encode(
    x=alt.X('count:Q'),
    y=alt.Y('collection:N'),
    tooltip=alt.Tooltip('count', format=',')
)
# Harvest every display collection facet by paging through the facet
# results 100 at a time, starting from the first page.
all_collections = {}
params.update({
    'facets': 'display_collection',
    'facets_per_page': 100,
    'facets_page': 1,
})
more = True
while more:
    data = get_data(params)
    facets = data['search']['facets']['primary_collection']
    # Stop as soon as a page comes back empty.
    more = bool(facets)
    if more:
        all_collections.update(facets)
        params['facets_page'] += 1
# Index -> 'collection' column, values -> 'count' column.
all_collections_df = (
    pd.Series(all_collections)
    .rename_axis('collection')
    .reset_index(name='count')
)
all_collections_df.head()
collection | count | |
---|---|---|
0 | Papers Past | 26122911 |
1 | Radio New Zealand | 778363 |
2 | iNaturalist NZ — Mātaki Taiao | 571510 |
3 | TAPUHI | 338051 |
4 | Auckland Libraries Heritage Images Collection | 267112 |
# Save the complete collection list next to the notebook (without the row
# index), then show a link for downloading the saved file.
csv_link = HTML('<a href="digitalnz_collections.csv" download>Download CSV file</a>')
all_collections_df.to_csv(path_or_buf='digitalnz_collections.csv', index=False)
display(csv_link)
# Restrict the query to the Papers Past display collection and facet it by
# 'collection', fetching the first page of 26 facet values.
params.update({
    'facets': 'collection',
    'and[display_collection][]': 'Papers Past',
    'facets_per_page': 26,
    'facets_page': 1,
})
data = get_data(params)
newspapers = data['search']['facets']['collection']
# Index -> 'newspaper' column, values -> 'count' column.
newspapers_df = (
    pd.Series(newspapers)
    .rename_axis('newspaper')
    .reset_index(name='count')
)
newspapers_df.head()
newspaper | count | |
---|---|---|
0 | Papers Past | 26122911 |
1 | Evening Post | 3772941 |
2 | Otago Daily Times | 1583125 |
3 | Wanganui Chronicle | 1163217 |
4 | Hawera & Normanby Star | 1075326 |
# The 'collection' facet includes a row for Papers Past itself, which is the
# parent collection rather than a newspaper. Drop it by name rather than by
# row position, so the chart stays correct if the facet order changes.
newspaper_rows_df = newspapers_df[newspapers_df['newspaper'] != 'Papers Past']
alt.Chart(newspaper_rows_df).mark_bar().encode(
    x=alt.X('count:Q'),
    y=alt.Y('newspaper:N'),
    tooltip=alt.Tooltip('count', format=',')
)
# Harvest every 'collection' facet within Papers Past by paging through the
# facet results 100 at a time, starting from the first page.
all_newspapers = {}
params.update({
    'facets': 'collection',
    'and[display_collection][]': 'Papers Past',
    'facets_per_page': 100,
    'facets_page': 1,
})
more = True
while more:
    data = get_data(params)
    facets = data['search']['facets']['collection']
    # Stop as soon as a page comes back empty.
    more = bool(facets)
    if more:
        all_newspapers.update(facets)
        params['facets_page'] += 1
# Index -> 'newspaper' column, values -> 'count' column.
all_newspapers_df = (
    pd.Series(all_newspapers)
    .rename_axis('newspaper')
    .reset_index(name='count')
)
all_newspapers_df.head()
newspaper | count | |
---|---|---|
0 | Papers Past | 26122911 |
1 | Evening Post | 3772941 |
2 | Otago Daily Times | 1583125 |
3 | Wanganui Chronicle | 1163217 |
4 | Hawera & Normanby Star | 1075326 |
# Save the newspaper list without the row for Papers Past itself. That row is
# dropped by name rather than by position, so the export stays correct even
# if the facet order changes. A download link for the file is then shown.
newspapers_only_df = all_newspapers_df[all_newspapers_df['newspaper'] != 'Papers Past']
newspapers_only_df.to_csv('paperspast_newspapers.csv', index=False)
display(HTML('<a href="paperspast_newspapers.csv" download>Download CSV file</a>'))
Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.