Harvest GLAM datasets from data.gov.au

This is a quick attempt to harvest datasets published by GLAM institutions using the new data.gov.au API.

To create the list of organisations, I searched the organisations on the data.gov.au site for 'library', 'archives', 'records', and 'museum'. I noticed that Queensland State Archives isn't included as an organisation, even though it's used as a tag, so I added it in as a query. There are inconsistencies in the way organisations are listed, so it's possible I've missed some.

In [1]:
import requests
import json
from json import JSONDecodeError
import pandas as pd
from urllib.parse import urlparse
from IPython.display import display, FileLink
import os
import re
import time
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Shared HTTP session for every API call in this notebook. Responses with
# status 502, 503 or 504 are retried (total=5, backoff_factor=1).
retries = Retry(status_forcelist=[502, 503, 504], backoff_factor=1, total=5)
s = requests.Session()
for scheme in ('http://', 'https://'):
    # One adapter instance per scheme, each using the same retry policy.
    s.mount(scheme, HTTPAdapter(max_retries=retries))
In [5]:
# Dataset search endpoint; every harvest request below is sent to this URL.
api_url = 'https://data.gov.au/api/v0/search/datasets'
# Names sent one at a time as the `publisher` search parameter.
# Some institutions appear under more than one spelling (for example
# 'State Library of NSW' and 'State Library of New South Wales'), so each
# variant is listed and searched separately.
organisations = [
    'NSW State Archives',
    'National Archives of Australia',
    'Libraries Tasmania',
    'State Records',
    'State Records Office of Western Australia',
    'State Library of Victoria',
    'State Library of NSW',
    'Mount Gambier Library',
    'National Library of Australia',
    'State Library of Queensland',
    'State Library of Western Australia',
    'State Library of South Australia',
    'State Library of New South Wales',
    'Western Australian Museum',
    'South Australian Museum',
    'Museum of Applied Arts and Sciences',
    'Tasmanian Museum and Art Gallery',
    'History Trust of South Australia',
    'Australian Institute of Aboriginal and Torres Strait Islander Studies (AIATSIS)',
    'National Portrait Gallery',
    'Australian Museum'
]
# Institutions that have no entry under organisations on data.gov.au.
# These strings are sent as the free-text `query` parameter instead, and the
# query text (minus any surrounding double quotes) is recorded as the publisher.
# NOTE(review): the embedded double quotes presumably request a phrase match --
# confirm against the API documentation.
queries = [
    '"Queensland State Archives"',
    'PROV Public Record Office'
]
In [6]:
def safe_get(dct, *keys):
    """Follow a path of keys through nested containers without raising.

    Args:
        dct: The outermost container, typically a dict parsed from JSON.
        *keys: Keys (or list indexes) to follow, outermost first.

    Returns:
        The value found at the end of the path, or None as soon as one step
        cannot be followed. When no keys are given, ``dct`` is returned as is.
    """
    value = dct
    for key in keys:
        try:
            value = value[key]
        except (LookupError, TypeError):
            # LookupError covers both a missing dict key (KeyError) and an
            # out-of-range list index (IndexError); TypeError covers a value
            # that cannot be subscripted at all, such as None or a number.
            return None
    return value

def process_dataset(dataset, query=None):
    """Flatten one dataset record into a list of per-file rows.

    Args:
        dataset: A single dataset record (dict) from the search results.
        query: The free-text query that found this dataset, if any. When
            given, it is used as the publisher label (surrounding double
            quotes removed) instead of the record's own publisher name.

    Returns:
        A list with one dict per entry in the dataset's ``distributions``.
        Each dict repeats the dataset-level fields and adds the file-level
        ones. The list is empty when the record has no distributions.
    """
    # The publisher and the other dataset-level fields are identical for every
    # file in the dataset, so they are worked out once, before the loop.
    if query:
        publisher = query.strip('"')
    else:
        publisher = safe_get(dataset, 'publisher', 'name')
    dataset_fields = {
        'dataset_title': safe_get(dataset, 'title'),
        'publisher': publisher,
        'dataset_issued': safe_get(dataset, 'issued'),
        'dataset_modified': safe_get(dataset, 'modified'),
        'dataset_description': safe_get(dataset, 'description'),
        'source': safe_get(dataset, 'catalog'),
        'info_url': safe_get(dataset, 'landingPage'),
        'start_date': safe_get(dataset, 'temporal', 'start', 'date'),
        'end_date': safe_get(dataset, 'temporal', 'end', 'date')
    }
    datafiles = []
    # A record with a missing or null `distributions` value contributes no
    # rows instead of aborting the whole harvest.
    for dist in safe_get(dataset, 'distributions') or []:
        datafile = dict(dataset_fields)
        datafile.update({
            'file_title': safe_get(dist, 'title'),
            'download_url': safe_get(dist, 'downloadURL'),
            'format': safe_get(dist, 'format'),
            'file_description': safe_get(dist, 'description'),
            'file_issued': safe_get(dist, 'issued'),
            'file_modified': safe_get(dist, 'modified'),
            'licence': safe_get(dist, 'license', 'name')
        })
        datafiles.append(datafile)
    return datafiles

def _search_datafiles(params, query=None):
    """Run one dataset search and return its flattened file rows.

    Args:
        params: Query-string parameters for the search request.
        query: Passed through to ``process_dataset`` so that free-text
            searches are labelled with the query instead of the publisher.

    Returns:
        A list of per-file dicts for every dataset in the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = s.get(api_url, params=params, timeout=60)
    print(response.url)
    # Stop on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    datafiles = []
    for dataset in data['dataSets']:
        datafiles += process_dataset(dataset, query=query)
    return datafiles


def harvest_datasets():
    """Harvest file details for every listed organisation and query.

    Each name in ``organisations`` is searched as a publisher, then each
    string in ``queries`` is searched as free text. The URL of every request
    is printed as it is made.

    Returns:
        A single list of per-file dicts, organisations first, then queries.

    Raises:
        requests.HTTPError: If any search request returns an error status.
    """
    # NOTE(review): each search is a single request with limit=100 and no
    # follow-up requests, so results beyond the first 100 datasets of a
    # search are presumably not harvested -- confirm against the API's paging.
    datafiles = []
    for organisation in organisations:
        datafiles += _search_datafiles({'publisher': organisation, 'limit': 100})
    for query in queries:
        datafiles += _search_datafiles({'query': query, 'limit': 100}, query=query)
    return datafiles
In [8]:
# Run the harvest; the URL of each request is printed as it is made.
datafiles = harvest_datasets()
https://data.gov.au/api/v0/search/datasets?publisher=NSW+State+Archives&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=National+Archives+of+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Libraries+Tasmania&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Records&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Records+Office+of+Western+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Victoria&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+NSW&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Mount+Gambier+Library&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=National+Library+of+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Queensland&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Western+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+South+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+New+South+Wales&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Western+Australian+Museum&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=South+Australian+Museum&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Museum+of+Applied+Arts+and+Sciences&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Tasmanian+Museum+and+Art+Gallery&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=History+Trust+of+South+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Australian+Institute+of+Aboriginal+and+Torres+Strait+Islander+Studies+%28AIATSIS%29&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=National+Portrait+Gallery&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Australian+Museum&limit=100
https://data.gov.au/api/v0/search/datasets?query=%22Queensland+State+Archives%22&limit=100
https://data.gov.au/api/v0/search/datasets?query=PROV+Public+Record+Office&limit=100
In [9]:
# Load the harvested rows (one per distribution file) into a DataFrame
# and preview the first few.
df = pd.DataFrame(datafiles)
df.head()
Out[9]:
dataset_description dataset_issued dataset_modified dataset_title download_url end_date file_description file_issued file_modified file_title format info_url licence publisher source start_date
0 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see: [Closer S... 2013-05-28T01:08:57Z None Closer Settlement Promotion files, 1913- CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
1 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see this page:... 2013-05-28T01:09:42Z None Closer Settlement Transfer Registers, Jul 1919... CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
2 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see this page:... 2013-05-28T01:10:39Z None Closer Settlement and Returned Soldier’s Trans... CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
3 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see this page:... 2013-05-28T01:11:22Z None Registers of Settlement Purchase, 1905-1929 CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
4 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see this page:... 2013-05-28T01:12:01Z None Returned Soldier Settlement Miscellaneous Files CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
In [10]:
# (rows, columns) of the harvested table.
df.shape
Out[10]:
(948, 16)
In [11]:
# Number of files recorded under each format label.
df['format'].value_counts()
Out[11]:
CSV           586
XML            81
JSON           74
XLSX           61
DOCX           34
HTML           33
PLAIN          14
ZIP            14
API             9
GEOJSON         8
DATA            6
OTHER           4
PDF             4
KML             3
RSS             2
JPEG            2
RDF             1
HMTL            1
WFS             1
TXT             1
APP             1
JAVASCRIPT      1
CSS             1
WMS             1
PAGE            1
Name: format, dtype: int64
In [12]:
# Number of files recorded under each licence name.
df['licence'].value_counts()
Out[12]:
Creative Commons Attribution                       250
Creative Commons Attribution 3.0 Australia         244
Creative Commons Attribution 4.0                   237
Creative Commons Attribution 4.0 International     146
Creative Commons Attribution 2.5 Australia          32
Creative Commons Attribution-NonCommercial          10
Other (Open)                                         5
notspecified                                         5
Creative Commons Attribution Share-Alike 4.0         3
Creative Commons Attribution 3.0                     3
Creative Commons Attribution Non-Commercial 4.0      2
Custom (Other)                                       1
Name: licence, dtype: int64
In [13]:
# Number of files attributed to each publisher label.
df['publisher'].value_counts()
Out[13]:
State Library of Queensland                                                        204
Queensland State Archives                                                          172
State Library of Western Australia                                                 147
State Library of South Australia                                                   140
Libraries Tasmania                                                                  71
State Records                                                                       41
PROV Public Record Office                                                           33
South Australian Museum                                                             33
State Library of New South Wales                                                    21
NSW State Archives                                                                  19
History Trust of South Australia                                                    19
State Records Office of Western Australia                                            7
State Library of NSW                                                                 6
Western Australian Museum                                                            6
National Library of Australia                                                        5
State Library of Victoria                                                            5
Australian Museum                                                                    4
Australian Institute of Aboriginal and Torres Strait Islander Studies (AIATSIS)      3
National Archives of Australia                                                       3
Museum of Applied Arts and Sciences                                                  3
Mount Gambier Library                                                                2
Tasmanian Museum and Art Gallery                                                     2
National Portrait Gallery                                                            2
Name: publisher, dtype: int64
In [14]:
# Save every harvested file record (all formats) and show a download link.
all_formats_csv = 'glam_datasets_all_formats_from_datagovau.csv'
df.to_csv(all_formats_csv, index=False)
display(FileLink(all_formats_csv))
In [15]:
# Keep only the rows whose format label is exactly 'CSV'.
is_csv = df['format'] == 'CSV'
csvs = df.loc[is_csv]
In [16]:
# (rows, columns) of the CSV-only subset.
csvs.shape
Out[16]:
(586, 16)
In [17]:
# Number of CSV files attributed to each publisher label.
csvs['publisher'].value_counts()
Out[17]:
State Library of Queensland                                                        185
Queensland State Archives                                                          122
State Library of Western Australia                                                  76
State Library of South Australia                                                    72
State Records                                                                       29
PROV Public Record Office                                                           26
Libraries Tasmania                                                                  23
NSW State Archives                                                                  17
State Library of New South Wales                                                    10
South Australian Museum                                                              8
History Trust of South Australia                                                     6
State Library of Victoria                                                            5
State Records Office of Western Australia                                            3
Australian Institute of Aboriginal and Torres Strait Islander Studies (AIATSIS)      2
Mount Gambier Library                                                                1
National Archives of Australia                                                       1
Name: publisher, dtype: int64
In [18]:
# Save the CSV-only subset and show a download link.
csv_only_path = 'glam_datasets_csvs_from_datagovau.csv'
csvs.to_csv(csv_only_path, index=False)
display(FileLink(csv_only_path))
In [19]:
# Write results to a markdown file: one section per publisher, one
# sub-section per dataset, one bullet per file.
from slugify import slugify
orgs = df.sort_values(by=['publisher', 'dataset_title', 'dataset_modified']).groupby('publisher')
# The encoding is set explicitly because titles contain non-ASCII characters
# and the platform default encoding is not guaranteed to be able to write them.
with open('glam_datasets_from_datagovau.md', 'w', encoding='utf-8') as md_file:
    for org, group in orgs:
        # The table-of-contents entry is printed to the cell output, not
        # written to the file.
        print('* [{}](#{})'.format(org, slugify(org)))
        md_file.write('\n## {}\n'.format(org))
        # NOTE(review): pandas groupby leaves out groups whose key is missing
        # by default, so files whose dataset has no title or no info_url are
        # presumably omitted here -- confirm whether that is intended.
        for (title, info_url), files in group.groupby(['dataset_title', 'info_url']):
            md_file.write('\n### [{}]({})\n'.format(title, info_url))
            for row in files.itertuples():
                md_file.write('* [{}]({}) ({}, {})\n'.format(row.file_title, row.download_url, row.format, row.file_issued))
    
* [Australian Institute of Aboriginal and Torres Strait Islander Studies (AIATSIS)](#australian-institute-of-aboriginal-and-torres-strait-islander-studies-aiatsis)
* [Australian Museum](#australian-museum)
* [History Trust of South Australia](#history-trust-of-south-australia)
* [Libraries Tasmania](#libraries-tasmania)
* [Mount Gambier Library](#mount-gambier-library)
* [Museum of Applied Arts and Sciences](#museum-of-applied-arts-and-sciences)
* [NSW State Archives](#nsw-state-archives)
* [National Archives of Australia](#national-archives-of-australia)
* [National Library of Australia](#national-library-of-australia)
* [National Portrait Gallery](#national-portrait-gallery)
* [PROV Public Record Office](#prov-public-record-office)
* [Queensland State Archives](#queensland-state-archives)
* [South Australian Museum](#south-australian-museum)
* [State Library of NSW](#state-library-of-nsw)
* [State Library of New South Wales](#state-library-of-new-south-wales)
* [State Library of Queensland](#state-library-of-queensland)
* [State Library of South Australia](#state-library-of-south-australia)
* [State Library of Victoria](#state-library-of-victoria)
* [State Library of Western Australia](#state-library-of-western-australia)
* [State Records](#state-records)
* [State Records Office of Western Australia](#state-records-office-of-western-australia)
* [Tasmanian Museum and Art Gallery](#tasmanian-museum-and-art-gallery)
* [Western Australian Museum](#western-australian-museum)
In [ ]: