Harvest files with the access status of 'closed'

The National Archives of Australia's RecordSearch database includes some information about files that we're not allowed to see. These files have been through the access examination process and ended up with an access status of 'closed'. You can read about my efforts to extract and interpret this data in Inside Story.

While you can search by access status in RecordSearch, you can't explore the reasons, so if you want to dig any deeper you need to harvest the data. This notebook shows you how.

Setting things up

In [138]:
import math
import re
import time
import datetime
import json
from tqdm import tqdm_notebook
import pandas as pd
from pandas.io.json import json_normalize
import ipywidgets as widgets
from IPython.display import display, HTML, FileLink
from tinydb import TinyDB, Query
from recordsearch_tools.client import RSSearchClient, RSItemClient
from recordsearch_tools.utilities import retry
In [89]:
# Regular expressions to match against the reasons in RS to normalise them.
# Each entry is [normalised label, regex pattern]. SearchHarvester.process_item()
# tests every pattern with re.match(), which anchors at the start of the reason string.
# Within a subsection, the character class in each pattern lets it skip over the
# letters and brackets of earlier paragraphs, so a munged value such as
# '33(1)(a)(b)' matches both the '33(1)(a)' and the '33(1)(b)' entries.
# The order of the entries determines the order of the saved labels.
EXCEPTIONS = [
    ['33(1)(a)', r'33\(1\)\(a\)'],
    ['33(1)(b)', r'33\(1\)[a\(\)]*\(b\)'],
    ['33(1)(c)', r'33\(1\)[ab\(\)]*\(c\)'],
    ['33(1)(d)', r'33\(1\)[abc\(\)]*\(d\)'],
    ['33(1)(e)(i)', r'33\(1\)[abcd\(\)]*\(e\)\(i\)'],
    ['33(1)(e)(ii)', r'33\(1\)[abcd\(\)]*\(e\)\(ii\)'],
    ['33(1)(e)(iii)', r'33\(1\)[abcd\(\)]*\(e\)\(iii\)'],
    ['33(1)(f)(i)', r'33\(1\)[abcdei\(\)]*\(f\)\(i\)'],
    ['33(1)(f)(ii)', r'33\(1\)[abcdei\(\)]*\(f\)\(ii\)'],
    ['33(1)(f)(iii)', r'33\(1\)[abcdei\(\)]*\(f\)\(iii\)'],
    # NOTE(review): the trailing '*' applies only to the final '\)', so this pattern
    # also matches an unclosed '33(1)(g' -- confirm whether that leniency is intended.
    ['33(1)(g)', r'33\(1\)[abcdefi\(\)]*\(g\)*'],
    ['33(1)(h)', r'33\(1\)[abcdefgi\(\)]*\(h\)'],
    ['33(1)(j)', r'33\(1\)[abcdefghi\(\)]*\(j\)'],
    ['33(2)(a)', r'33\(2\)\(a\)'],
    ['33(2)(b)', r'33\(2\)[a\(\)]*\(b\)'],
    ['33(3)(a)(i)', r'33\(3\)\(a\)\(i\)'],
    ['33(3)(a)(ii)', r'33\(3\)\(a\)\(ii\)'],
    ['33(3)(b)', r'33\(3\)[ai\(\) &]*\(b\)'],
    # Anything starting with 'Closed period' is collapsed to the one label
    ['Closed period', r'Closed period.*']
]
In [93]:
class SearchHarvester():
    """
    Harvest the details of 'Closed' files from RecordSearch.
    Saves to a TinyDB database.

    Usage:
        harvester = SearchHarvester()
        harvester.start_harvest()
    """

    def __init__(self, **kwargs):
        """
        Size the harvest with an initial search, then open the database.

        Keyword arguments are accepted but are not currently used.
        """
        self.total_pages = None
        self.client = RSSearchClient()
        self.prepare_harvest(access='Closed')
        self.db = TinyDB('data/db-closed.json', default_table='items')

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def prepare_harvest(self, **kwargs):
        """
        Run a search and work out how many pages of results need harvesting.

        The keyword arguments are passed straight through to the search client.
        Prints the number of items and pages, and stores the page count in
        self.total_pages.
        """
        self.client.search(**kwargs)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        # Round up so that a partly filled last page is counted, without adding
        # a spurious extra page when the total is an exact multiple of the page size
        self.total_pages = math.ceil(int(total_results) / self.client.results_per_page)
        print('{} pages'.format(self.total_pages))

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def process_item(self, result):
        """
        Get the full record for a search result and normalise its access reasons.

        Parameters:
            result: a search result; only its 'identifier' value is used here.

        Returns:
            The item summary with an added 'reasons' list. Each entry in the
            item's 'access_reason' list contributes the label of every EXCEPTIONS
            pattern it matches, or its original text if it matches none.
        """
        item_client = RSItemClient()
        # Search results don't include all the details, so get the full item record
        item = item_client.get_summary(entity_id=result['identifier'], date_format='iso')
        item['reasons'] = []
        # The access reason field can munge together multiple reasons, so we need to separate & normalise
        for reason in item['access_reason']:
            reason_text = reason['reason']
            # Collect the label of every pattern found at the start of the access reason field
            labels = [label for label, pattern in EXCEPTIONS if re.match(pattern, reason_text)]
            # If nothing matches, just save the original
            item['reasons'].extend(labels or [reason_text])
        return item

    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def start_harvest(self, start=1):
        """
        Work through the pages of search results, saving each item to the database.

        Parameters:
            start: the results page to start from (pages are numbered from 1).

        NOTE(review): the retry decorator wraps the whole method, so a
        ConnectionError raised by a page request presumably restarts the loop
        from `start` -- confirm against recordsearch_tools. Items already in
        the database are skipped, so a restart does not re-fetch their details.
        """
        Record = Query()
        for page in tqdm_notebook(range(start, self.total_pages + 1), unit='page', desc='Pages:'):
            response = self.client.search(access='Closed', page=page, sort='9')
            for result in tqdm_notebook(response['results'], unit='items', desc='Items on page:', leave=False):
                # Save some time by ignoring records we've already harvested
                # Handy if you're restarting a failed harvest
                if not self.db.contains(Record.identifier == result['identifier']):
                    item = self.process_item(result)
                    self.db.upsert(item, Record.identifier == item['identifier'])
            # Pause for a second between pages of results
            time.sleep(1)

Start the harvest

In [94]:
# Run the harvest (may take a few hours)
# Creating the harvester runs an initial search and prints the item and page totals.
# start_harvest() then works through the results pages, saving each item to the TinyDB database.
harvester = SearchHarvester()
harvester.start_harvest()
11953 items
598 pages

Save the results for download

In [142]:
def get_data():
    """
    Load the harvested items from the TinyDB database.

    Returns:
        The records returned by db.all() for the 'items' table
        of 'data/db-closed.json'.
    """
    # Open the database as a context manager so that the underlying file
    # is closed once the records have been read
    with TinyDB('data/db-closed.json', default_table='items') as db:
        return db.all()

def make_filename():
    """
    Build the base name (path without extension) for the export files.

    The name is 'data/closed-' followed by the current local date as YYYYMMDD.
    """
    right_now = datetime.datetime.now()
    date_stamp = right_now.strftime('%Y%m%d')
    return 'data/closed-{}'.format(date_stamp)

def save_csv():
    """
    Save the harvested items as a CSV file and display a download link.

    The nested date fields are flattened into their own columns, the columns
    are renamed and reordered, and each item's list of reasons is written as
    a single '|' separated string.
    """
    # Flattened (dotted) column names mapped to the names used in the CSV
    renamed_columns = {
        'access_decision.date_str': 'access_decision_date_str',
        'access_decision.start_date': 'access_decision_date',
        'contents_dates.date_str': 'contents_date_str',
        'contents_dates.start_date': 'contents_start_date',
        'contents_dates.end_date': 'contents_end_date',
    }
    # The columns we want, in a nice order
    wanted_columns = [
        'identifier',
        'series',
        'control_symbol',
        'title',
        'contents_date_str',
        'contents_start_date',
        'contents_end_date',
        'location',
        'access_status',
        'access_decision_date_str',
        'access_decision_date',
        'reasons',
    ]
    records = get_data()
    # Flatten the date fields using json_normalize and convert to a dataframe
    flattened = pd.DataFrame(json_normalize(records))
    flattened.rename(columns=renamed_columns, inplace=True)
    # Work on a copy of the selected columns so the assignment below is safe
    output = flattened[wanted_columns].copy()
    # Save the reasons lists as | separated strings
    output['reasons'] = output['reasons'].str.join('|')
    csv_path = '{}.csv'.format(make_filename())
    output.to_csv(csv_path, index=False)
    display(FileLink(csv_path))
    
def save_json():
    """
    Save the harvested items as a JSON file and display a download link.

    The records are written exactly as they are loaded from the database.
    """
    records = get_data()
    json_path = '{}.json'.format(make_filename())
    with open(json_path, 'w') as output_file:
        json.dump(records, output_file)
    download_link = FileLink(json_path)
    display(download_link)
In [143]:
# Save results as CSV and JSON and provide handy download links
# Both files are written to the data/ directory with today's date in the file name
save_csv()
save_json()