Harvesting functions from the RecordSearch interface

This notebook attempts to extract information from the RecordSearch interface about the hierarchy of functions it uses to describe the work of government agencies.

Previous explorations have shown that the NAA's use of functions is rather inconsistent. All I'm doing here is finding out what functions RecordSearch itself says it is using. This may not be complete, but it seems like a useful starting point.

There are a few inconsistencies that I've tried to clean up. In particular, the hierarchy is broken in a number of places where a child term links up to a non-preferred term. In this case I've replaced the non-preferred term with the preferred term.

I've also noticed that some 'narrower' terms don't have their own entries in the main list, so I've made sure that these are all added in.

I suspect that the majority of these terms are never used, but we'll save that question for another notebook...

In [53]:
from copy import deepcopy
from operator import itemgetter
import json
import os
import robobrowser
import re
import pandas as pd
from IPython.display import display, HTML, FileLink
from tqdm import tqdm_notebook
from tinydb import TinyDB, Query
from tinydb.operations import set
from recordsearch_tools.client import RSClient, RSAgencySearchClient

# Make sure there's somewhere to save data files
os.makedirs('data', exist_ok=True)
In [121]:
# The harvesting code
# Much kludginess here to deal with inconsistencies in RS

BROKEN_HIERARCHIES = {
    # borked parent: good parent
    'australian defence forces (adf)': 'defence forces',
    'immigration': 'migration',
    'community protection': 'customs',
    'security': 'security and intelligence',
    'finance management': 'financial matters',
    'education and training': 'education',
    'governance': None,
    'customs regulations': 'customs',
    'employment services': 'employment',
    'health care': 'health',
    'maritime services': 'sea transport',
    'early childhood education': 'education',
    'fiscal policy': 'financial matters',
    'marine and rural regulation': 'primary industries',
    'civic infrastructure': 'works',
    'retirement income': 'financial matters',
    'import regulation': 'trade'
}

MOVE_DUPLICATE_PARENTS = {
    # term: correct parent
    'rail transport': 'land transport',
    'road transport': 'land transport',
    'tariff regulation': 'customs',
    'overseas aid programs': 'international relations',
    'consular services': 'international relations'
}

DELETE_DUPLICATE_CHILDREN = {
    # term: children to delete
    'transport': ['rail transport', 'road transport'],
    'trade': ['tariff regulation'],
    'foreign policy': ['overseas aid programs'],
    'government representation overseas': ['consular services']
}

class RSFunctionsClient(RSClient):
    '''
    Harvests the functions that are used in the RS functions browse interface.
    '''
    
    def __init__(self):
        super().__init__()
        self.db = TinyDB('data/db_functions.json')
        
        
    def save_terms(self, row):
        Record = Query()
        links = row.find_all('a', href=True)
        for link in links:
            self.db.upsert({'term': link.string.lower()}, Record.term == link.string.lower())
            
            
    def save_relations(self, row):
        Record = Query()
        term = row.find('a').string.lower()
        if row.find('table'):
            # Loop through the rows underneath the term heading to get info about related terms
            for related in row.find('table').find_all('tr'):
                cells = related.find_all('td')
                if re.search(r'Broad term', cells[0].string):
                    # This is the parent of the current term
                    parent = cells[1].find('a').string.lower()
                    if parent in BROKEN_HIERARCHIES:
                        parent = BROKEN_HIERARCHIES[parent]
                    if term in MOVE_DUPLICATE_PARENTS:
                        parent = MOVE_DUPLICATE_PARENTS[term]
                    if parent:
                        self.db.update(set('parent', parent), Record.term == term)
                if re.search(r'Narrow terms', cells[0].string):
                    if term in BROKEN_HIERARCHIES:
                        term = BROKEN_HIERARCHIES[term]
                    for link in cells[1].find_all('a'):
                        # These are children of the current term
                        child = link.string.lower()
                        if not (term in DELETE_DUPLICATE_CHILDREN and child in DELETE_DUPLICATE_CHILDREN[term]):
                            self.db.update(set('parent', term), Record.term == child) 
        
        
    def process_pages(self, loop=1):
        self.br.session.headers.update({'Referer': 'http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/BasicSearch.aspx'})
        self.br.open('http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/AdvSearchFunctionsBrowsing.aspx')
        for letter in tqdm_notebook(range(0, 26)):
            form = self.br.get_form(id='formSNRMaster')
            form['__EVENTTARGET'] = 'ctl00$ContentPlaceHolderSNR$ctl{}'.format(str(letter).zfill(2))
            submit = robobrowser.forms.fields.Input('<input type="submit" value="Submit" name="submit">Submit</input>')
            form.add_field(submit)
            self.br.submit_form(form, submit=form['submit'])
            try:
                for row in self.br.find(id='ContentPlaceHolderSNR_dlFunctions').find_all('tr', recursive=False):
                    if loop == 1:
                        self.save_terms(row)
                    elif loop == 2:
                        self.save_relations(row)
            except AttributeError:
                # No terms on this page
                pass

    
    def harvest_functions(self):
        # Try looping through twice to deal with the inconsistences in the way relationships are defined
        # First loop is to get every term that has a link back into RS indicating that it might be used
        self.process_pages(loop=1)
        # Ok now we have every term, let's try to put them in a hierarchy
        self.process_pages(loop=2)
        
        
    def get_children(self, parent):
        Record = Query()
        term = {'term': parent['term'], 'narrower': []}
        children = self.db.search(Record.parent == parent['term'])
        for child in children:
            term['narrower'].append(self.get_children(child))
        return term


    def make_hierarchy(self):
        Record = Query()
        terms = []
        parents = self.db.search(~(Record.parent.exists()))
        for parent in parents:
            terms.append(self.get_children(parent))
        return sorted(terms, key=itemgetter('term'))
In [122]:
# Start a harvest!
c = RSFunctionsClient()
c.harvest_functions()
functions = c.make_hierarchy()

Save and download the results

Save the harvested functions as text and JSON files for easy download.

In [128]:
def get_text_levels(function, level):
    f_list = []
    if 'narrower' in function:
        level += 1
        for subf in function['narrower']:
            f_list.append('{}{} {}'.format(level * '  ', level * '-', subf['term']))
            f_list += get_text_levels(subf, level=level)
    return f_list

def save_text():
    functions_list = []
    for function in functions:
        functions_list.append(function['term'].upper())
        functions_list += get_text_levels(function, level=0)
    with open('data/functions.txt', 'w') as text_file:
        for row in functions_list:
            text_file.write('{}\n'.format(row))
            
def get_csv_levels(function, row):
    rows = []
    if 'narrower' in function:
        key = 'level{}'.format(len(row) + 1)
        for subf in function['narrower']:
            this_row = deepcopy(row)
            this_row[key] = subf['term']
            rows.append(this_row)
            rows += get_csv_levels(subf, this_row)
    return rows
            
def save_csv():
    rows = []
    for function in functions:
        row = {'level1': function['term']}
        rows.append(row)
        rows += get_csv_levels(function, row)
    df = pd.DataFrame(rows)
    df.to_csv('data/functions.csv', index=False)
    
def save_functions():
    '''
    Saves the harvested list of functions in text, json, and csv.
    '''
    with open('data/functions.json', 'w') as json_file:
        json.dump(functions, json_file, indent=4)
    save_text()
    save_csv()
    display(FileLink('data/functions.txt'))
    display(FileLink('data/functions.json'))
    display(FileLink('data/functions.csv'))
In [ ]: