Harvest GLAM datasets from data.gov.au

This is a quick attempt to harvest datasets published by GLAM (galleries, libraries, archives, and museums) institutions using the new data.gov.au API.

To create the list of organisations, I searched the organisations on the data.gov.au site for 'library', 'archives', 'records', and 'museum'. I noticed that Queensland State Archives isn't included as an organisation, even though it's used as a tag, so I added it in as a free-text query instead. There are inconsistencies in the way organisations are listed (for example, both 'State Library of NSW' and 'State Library of New South Wales' appear), so it's possible I've missed some.

In [1]:
import requests
import json
from json import JSONDecodeError
import pandas as pd
from urllib.parse import urlparse
from IPython.display import display, FileLink
import os
import re
import time
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Shared HTTP session used for every API request. Requests that come back
# with a 502, 503 or 504 status are retried (up to 5 times, with backoff).
retries = Retry(total=5, status_forcelist=[502, 503, 504], backoff_factor=1)
s = requests.Session()
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
In [2]:
# Dataset search endpoint of the data.gov.au API; every harvest request goes here.
api_url = 'https://data.gov.au/api/v0/search/datasets'
# Publisher names, each sent as the 'publisher' parameter of a search.
# Some institutions are listed under more than one spelling (for example
# 'State Library of NSW' and 'State Library of New South Wales'), so both
# variants are included.
organisations = [
    'NSW State Archives',
    'National Archives of Australia',
    'Libraries Tasmania',
    'State Records',
    'State Records Office of Western Australia',
    'State Library of Victoria',
    'State Library of NSW',
    'Mount Gambier Library',
    'National Library of Australia',
    'State Library of Queensland',
    'State Library of Western Australia',
    'State Library of South Australia',
    'State Library of New South Wales',
    'Western Australian Museum',
    'South Australian Museum',
    'Museum of Applied Arts and Sciences',
    'Tasmanian Museum and Art Gallery',
    'History Trust of South Australia'
]
# Institutions with no entry under organisations, searched for with the
# 'query' parameter instead. The double quotes are part of the query string;
# they are stripped again when the query is reused as the publisher name.
queries = [
    '"Queensland State Archives"'
]
In [14]:
def safe_get(dct, *keys):
    """Follow a path of keys through a nested structure, tolerating gaps.

    Args:
        dct: The outermost container (typically a dict decoded from JSON).
        *keys: Keys (or list indexes) to follow, outermost first.

    Returns:
        The value at the end of the path, or None as soon as a key is
        missing, a list index is out of range, or an intermediate value
        cannot be subscripted (for example because it is None). With no
        keys, ``dct`` itself is returned.
    """
    for key in keys:
        try:
            dct = dct[key]
        except (KeyError, IndexError, TypeError):
            # KeyError: missing dict key; IndexError: list index out of
            # range; TypeError: value is not subscriptable with this key.
            return None
    return dct

def process_dataset(dataset, query=None):
    """Flatten one dataset record into one row per distribution (file).

    Args:
        dataset: A dataset dict taken from the 'dataSets' list of a search
            response.
        query: The search query that found this dataset, if it was found by
            a free-text query rather than a publisher search. When given, it
            is used (with surrounding double quotes removed) as the publisher
            name instead of the dataset's own publisher.

    Returns:
        A list of dicts, one per entry in the dataset's 'distributions'.
        Each dict combines the dataset-level fields with that distribution's
        file-level fields; any field missing from the record is None. The
        list is empty when the dataset has no distributions.
    """
    # The publisher and the other dataset-level fields are the same for every
    # distribution, so they are worked out once, outside the loop.
    if query:
        publisher = query.strip('"')
    else:
        publisher = safe_get(dataset, 'publisher', 'name')
    dataset_fields = {
        'dataset_title': safe_get(dataset, 'title'),
        'publisher': publisher,
        'dataset_issued': safe_get(dataset, 'issued'),
        'dataset_modified': safe_get(dataset, 'modified'),
        'dataset_description': safe_get(dataset, 'description'),
        'source': safe_get(dataset, 'catalog'),
        'info_url': safe_get(dataset, 'landingPage'),
        'start_date': safe_get(dataset, 'temporal', 'start', 'date'),
        'end_date': safe_get(dataset, 'temporal', 'end', 'date'),
    }
    datafiles = []
    # A missing or null 'distributions' entry is treated as "no files"
    # rather than raising.
    for dist in safe_get(dataset, 'distributions') or []:
        datafiles.append({
            **dataset_fields,
            'file_title': safe_get(dist, 'title'),
            'download_url': safe_get(dist, 'downloadURL'),
            'format': safe_get(dist, 'format'),
            'file_description': safe_get(dist, 'description'),
            'file_issued': safe_get(dist, 'issued'),
            'file_modified': safe_get(dist, 'modified'),
            'licence': safe_get(dist, 'license', 'name'),
        })
    return datafiles

def _search_datasets(field, value):
    """Run one search against the API and return its list of dataset records.

    Args:
        field: Name of the search parameter to use ('publisher' or 'query').
        value: Value to search for.

    Returns:
        The 'dataSets' list from the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    # NOTE(review): a single request with limit=100 is made per search, so
    # presumably at most 100 datasets come back for any one publisher or
    # query -- confirm none has more, or add paging.
    response = s.get(api_url, params={field: value, 'limit': 100})
    print(response.url)
    # Report a failed request by its HTTP status rather than letting it show
    # up later as a JSON decoding error or a missing 'dataSets' key.
    response.raise_for_status()
    return response.json()['dataSets']


def harvest_datasets():
    """Harvest file details for every configured organisation and query.

    Each name in ``organisations`` is searched for as a publisher, then each
    entry in ``queries`` as a free-text query. The URL of every request is
    printed as it is made.

    Returns:
        A list of dicts, one per distribution (file), as built by
        ``process_dataset``. Rows found through a query carry that query
        (without its quotes) as their publisher.
    """
    datafiles = []
    for organisation in organisations:
        for dataset in _search_datasets('publisher', organisation):
            datafiles += process_dataset(dataset)
    for query in queries:
        for dataset in _search_datasets('query', query):
            datafiles += process_dataset(dataset, query=query)
    return datafiles
In [15]:
# Run the harvest; the URL of each API request is printed as it is made.
datafiles = harvest_datasets()
https://data.gov.au/api/v0/search/datasets?publisher=NSW+State+Archives&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=National+Archives+of+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Libraries+Tasmania&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Records&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Records+Office+of+Western+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Victoria&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+NSW&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Mount+Gambier+Library&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=National+Library+of+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Queensland&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+Western+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+South+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=State+Library+of+New+South+Wales&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Western+Australian+Museum&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=South+Australian+Museum&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Museum+of+Applied+Arts+and+Sciences&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=Tasmanian+Museum+and+Art+Gallery&limit=100
https://data.gov.au/api/v0/search/datasets?publisher=History+Trust+of+South+Australia&limit=100
https://data.gov.au/api/v0/search/datasets?query=%22Queensland+State+Archives%22&limit=100
In [16]:
# One row per harvested file (distribution), one column per harvested field.
df = pd.DataFrame(datafiles)
df.head()
Out[16]:
dataset_description dataset_issued dataset_modified dataset_title download_url end_date file_description file_issued file_modified file_title format info_url licence publisher source start_date
0 Name search for railway employees who died in ... 2014-09-30T04:45:10Z 2016-07-20T12:15:24Z NSW Government Railways and Tramways Roll of H... http://data.nsw.gov.au/data/storage/f/2014-09-... None This dataset contains the following attributes... 2014-09-30T00:46:33Z None NSW Govt Railways and Tramways - Roll of Honou... CSV https://data.nsw.gov.au/data/dataset/33809e06-... Creative Commons Attribution NSW State Archives New South Wales Government None
1 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see: [Closer S... 2013-05-28T01:08:57Z None Closer Settlement Promotion files, 1913- CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
2 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see this page:... 2013-05-28T01:09:42Z None Closer Settlement Transfer Registers, Jul 1919... CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
3 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see this page:... 2013-05-28T01:10:39Z None Closer Settlement and Returned Soldier’s Trans... CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
4 The passage of the Returned Soldiers Settlemen... 2013-05-28T05:07:29Z 2016-06-16T01:18Z Soldier Settlement Indexes https://data.nsw.gov.au/data/dataset/78fe0108-... None For a description of this data, see this page:... 2013-05-28T01:11:22Z None Registers of Settlement Purchase, 1905-1929 CSV https://data.nsw.gov.au/data/dataset/78fe0108-... Creative Commons Attribution NSW State Archives New South Wales Government None
In [17]:
# (number of files, number of fields) in the full table.
df.shape
Out[17]:
(835, 16)
In [18]:
# How many files there are in each format.
df['format'].value_counts()
Out[18]:
CSV           447
XML            79
JSON           73
XLSX           54
ESRI REST      41
HTML           34
DOCX           33
PLAIN          16
ZIP            13
GEOJSON         8
API             8
DATA            6
OTHER           4
RSS             2
JPEG            2
KML             2
MPK             2
APP             1
CSS             1
JAVASCRIPT      1
PDF             1
HMTL            1
WFS             1
WMS             1
Name: format, dtype: int64
In [19]:
# How many files are published under each licence name.
df['licence'].value_counts()
Out[19]:
Creative Commons Attribution                                  310
Creative Commons Attribution 4.0                              168
Creative Commons Attribution 3.0 Australia                    156
Creative Commons Attribution 4.0 International                110
License not specified                                          41
Creative Commons Attribution 2.5 Australia                     15
Creative Commons Attribution-NonCommercial                     10
notspecified                                                    5
Other (Open)                                                    4
Creative Commons Attribution 3.0                                3
Creative Commons Attribution Share-Alike                        3
Creative Commons Non-Commercial (Any)                           2
Other (Non-Commercial)                                          1
Creative Commons Attribution Share Alike 4.0 International      1
Name: licence, dtype: int64
In [20]:
# How many files are attributed to each publisher.
df['publisher'].value_counts()
Out[20]:
Queensland State Archives                    172
State Library of Western Australia           147
State Library of South Australia             128
State Library of Queensland                  101
Libraries Tasmania                            71
State Records Office of Western Australia     44
State Records                                 41
South Australian Museum                       33
State Library of New South Wales              21
NSW State Archives                            19
History Trust of South Australia              17
Western Australian Museum                     14
State Library of Victoria                      6
State Library of NSW                           6
National Library of Australia                  5
Museum of Applied Arts and Sciences            3
National Archives of Australia                 3
Tasmanian Museum and Art Gallery               2
Mount Gambier Library                          2
Name: publisher, dtype: int64
In [21]:
# Save the full table (all formats) as a CSV file and show a link to it.
df.to_csv('glam_datasets_all_formats_from_datagovau.csv', index=False)
display(FileLink('glam_datasets_all_formats_from_datagovau.csv'))
In [22]:
# Keep only the files whose format is recorded as exactly 'CSV'.
is_csv = df['format'].eq('CSV')
csvs = df[is_csv]
In [23]:
# (number of CSV files, number of fields).
csvs.shape
Out[23]:
(447, 16)
In [25]:
# How many CSV files are attributed to each publisher.
csvs['publisher'].value_counts()
Out[25]:
Queensland State Archives                    122
State Library of Queensland                   80
State Library of Western Australia            76
State Library of South Australia              66
State Records                                 29
Libraries Tasmania                            23
NSW State Archives                            17
State Library of New South Wales              10
South Australian Museum                        8
State Library of Victoria                      6
History Trust of South Australia               5
State Records Office of Western Australia      3
Mount Gambier Library                          1
National Archives of Australia                 1
Name: publisher, dtype: int64
In [24]:
# Save the CSV-only subset as its own file and show a link to it.
csvs.to_csv('glam_datasets_csvs_from_datagovau.csv', index=False)
display(FileLink('glam_datasets_csvs_from_datagovau.csv'))
In [ ]: