The National Archives of Australia's RecordSearch database includes some information about files that we're not allowed to see. These files have been through the access examination process and ended up with an access status of 'closed'. You can read about my efforts to extract and interpret this data in Inside Story.
While you can search by access status in RecordSearch, you can't explore the reasons, so if you want to dig any deeper you need to harvest the data. This notebook shows you how.
import math
import re
import time
import datetime
import json
from tqdm import tqdm_notebook
import pandas as pd
from pandas.io.json import json_normalize
import ipywidgets as widgets
from IPython.display import display, HTML, FileLink
from tinydb import TinyDB, Query
from recordsearch_tools.client import RSSearchClient, RSItemClient
from recordsearch_tools.utilities import retry
# Regular expressions to match against the reasons in RS to normalise them.
# Each entry pairs a canonical exception label (Archives Act s.33 paragraph)
# with a regex matching the variants found in RecordSearch's access reason
# field. The field can munge several reasons together (e.g. '33(1)(a)(b)'),
# so the permissive character classes like [a\(\)]* skip over any earlier
# paragraph letters that precede the one each pattern is looking for.
EXCEPTIONS = [
    ['33(1)(a)', r'33\(1\)\(a\)'],
    ['33(1)(b)', r'33\(1\)[a\(\)]*\(b\)'],
    ['33(1)(c)', r'33\(1\)[ab\(\)]*\(c\)'],
    ['33(1)(d)', r'33\(1\)[abc\(\)]*\(d\)'],
    ['33(1)(e)(i)', r'33\(1\)[abcd\(\)]*\(e\)\(i\)'],
    ['33(1)(e)(ii)', r'33\(1\)[abcd\(\)]*\(e\)\(ii\)'],
    ['33(1)(e)(iii)', r'33\(1\)[abcd\(\)]*\(e\)\(iii\)'],
    ['33(1)(f)(i)', r'33\(1\)[abcdei\(\)]*\(f\)\(i\)'],
    ['33(1)(f)(ii)', r'33\(1\)[abcdei\(\)]*\(f\)\(ii\)'],
    ['33(1)(f)(iii)', r'33\(1\)[abcdei\(\)]*\(f\)\(iii\)'],
    ['33(1)(g)', r'33\(1\)[abcdefi\(\)]*\(g\)*'],
    ['33(1)(h)', r'33\(1\)[abcdefgi\(\)]*\(h\)'],
    ['33(1)(j)', r'33\(1\)[abcdefghi\(\)]*\(j\)'],
    ['33(2)(a)', r'33\(2\)\(a\)'],
    ['33(2)(b)', r'33\(2\)[a\(\)]*\(b\)'],
    ['33(3)(a)(i)', r'33\(3\)\(a\)\(i\)'],
    ['33(3)(a)(ii)', r'33\(3\)\(a\)\(ii\)'],
    ['33(3)(b)', r'33\(3\)[ai\(\) &]*\(b\)'],
    # 'Closed period' is an access status rather than a s.33 exception
    ['Closed period', r'Closed period.*']
]
class SearchHarvester():
    """
    Harvest the details of 'Closed' files from RecordSearch.

    Saves each harvested item to a TinyDB database at data/db-closed.json.

    Usage:
        harvester = SearchHarvester()
        harvester.start_harvest()
    """

    def __init__(self, **kwargs):
        self.total_pages = None
        self.client = RSSearchClient()
        # Run an initial search so we know how many pages to harvest
        self.prepare_harvest(access='Closed')
        # NOTE(review): assumes the 'data' directory already exists
        self.db = TinyDB('data/db-closed.json', default_table='items')

    @staticmethod
    def _count_pages(total_results, results_per_page):
        """Return the number of result pages needed to hold total_results.

        Uses ceiling division: the previous floor(total / per_page) + 1
        requested one extra, empty page whenever the total was an exact
        multiple of the page size.
        """
        return (int(total_results) + results_per_page - 1) // results_per_page

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def prepare_harvest(self, **kwargs):
        """Run a search to find the total number of results and pages."""
        self.client.search(**kwargs)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        self.total_pages = self._count_pages(total_results, self.client.results_per_page)
        print('{} pages'.format(self.total_pages))

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def process_item(self, result):
        """Fetch the full record for one search result and normalise its access reasons."""
        item_client = RSItemClient()
        # Search results don't include all the details, so get the full item record
        item = item_client.get_summary(entity_id=result['identifier'], date_format='iso')
        item['reasons'] = []
        # The access reason field can munge together multiple reasons, so we need to separate & normalise
        for reason in item['access_reason']:
            matched = False
            # Loop through the regexp patterns to see what we can find in the access reason field, save any matches
            for exception, pattern in EXCEPTIONS:
                if re.match(pattern, reason['reason']):
                    item['reasons'].append(exception)
                    matched = True
            if not matched:
                # If nothing matches, just save the original
                item['reasons'].append(reason['reason'])
        return item

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def start_harvest(self, start=1):
        """Harvest all pages of 'Closed' results from `start`, saving items to TinyDB.

        Args:
            start: page number to begin from (1-based); handy for restarting
                a failed harvest part-way through.
        """
        Record = Query()
        for page in tqdm_notebook(range(start, self.total_pages + 1), unit='page', desc='Pages:'):
            response = self.client.search(access='Closed', page=page, sort='9')
            for result in tqdm_notebook(response['results'], unit='items', desc='Items on page:', leave=False):
                # Save some time by ignoring records we've already harvested
                # Handy if you're restarting a failed harvest
                if not self.db.contains(Record.identifier == result['identifier']):
                    item = self.process_item(result)
                    self.db.upsert(item, Record.identifier == item['identifier'])
            # Be polite to the RecordSearch server between pages
            time.sleep(1)
# Run the harvest (may take a few hours)
# Instantiating the harvester runs an initial search and prints the
# item and page counts before start_harvest() walks every page.
harvester = SearchHarvester()
harvester.start_harvest()
# Example output from the cell above: 11953 items, 598 pages
def get_data():
    """Return every harvested item record from the TinyDB store."""
    database = TinyDB('data/db-closed.json', default_table='items')
    records = database.all()
    return records
def make_filename(prefix='closed'):
    """Return a date-stamped base filename (no extension) in the data directory.

    Args:
        prefix: label for the file; defaults to 'closed' to match the harvest,
            but can be overridden for other exports.

    Returns:
        A string like 'data/closed-20190101'.
    """
    return 'data/{}-{}'.format(prefix, datetime.datetime.now().strftime('%Y%m%d'))
def save_csv():
    """Save the harvested items as a dated CSV file and display a download link.

    Flattens the nested date dicts into columns and joins each item's list of
    reasons into a single '|'-separated string so it fits in one CSV cell.
    """
    items = get_data()
    # json_normalize flattens the nested date fields and already returns a
    # DataFrame, so the original pd.DataFrame(...) wrapper was redundant.
    # (pd.json_normalize replaces the deprecated pandas.io.json import.)
    df = pd.json_normalize(items)
    # Rename the dates columns
    df.rename(columns={'access_decision.date_str': 'access_decision_date_str', 'access_decision.start_date': 'access_decision_date', 'contents_dates.date_str': 'contents_date_str', 'contents_dates.start_date': 'contents_start_date', 'contents_dates.end_date': 'contents_end_date'}, inplace=True)
    # Get the columns we want and put them in a nice order; .copy() so the
    # 'reasons' assignment below doesn't trigger a SettingWithCopyWarning
    df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_date_str', 'contents_start_date', 'contents_end_date', 'location', 'access_status', 'access_decision_date_str', 'access_decision_date', 'reasons']].copy()
    # Save the reasons lists as | separated strings
    df['reasons'] = df['reasons'].str.join('|')
    filename = '{}.csv'.format(make_filename())
    df.to_csv(filename, index=False)
    display(FileLink(filename))
def save_json():
    """Dump all harvested items to a dated JSON file and display a download link."""
    filename = '{}.json'.format(make_filename())
    records = get_data()
    with open(filename, 'w') as out_file:
        json.dump(records, out_file)
    display(FileLink(filename))
# Save results as CSV and JSON and provide handy download links
# (filenames are date-stamped, e.g. data/closed-20190101.csv)
save_csv()
save_json()