Notebook

Harvesting functions from the RecordSearch interface¶

This notebook attempts to extract information from the RecordSearch interface about the hierarchy of functions it uses to describe the work of government agencies.

Previous explorations have shown that the NAA's use of functions is rather inconsistent. All I'm doing here is finding out what functions RecordSearch itself says it is using. This may not be complete, but it seems like a useful starting point.

There are a few inconsistencies that I've tried to clean up. In particular, the hierarchy is broken in a number of places where a child term links up to a non-preferred term. In this case I've replaced the non-preferred term with the preferred term.

I've also noticed that some 'narrower' terms don't have their own entries in the main list, so I've made sure that these are all added in.

I suspect that the majority of these terms are never used, but we'll save that question for another notebook...

In [1]:

import datetime
import json
import os
import re
from copy import deepcopy
from operator import itemgetter

import mechanicalsoup
import pandas as pd
from IPython.display import FileLink, display
from tinydb import Query, TinyDB
from tinydb.operations import set
from tqdm.auto import tqdm

# Make sure there's somewhere to save data files
os.makedirs("data", exist_ok=True)

In [2]:

# The harvesting code
# Much kludginess here to deal with inconsistencies in RS

BROKEN_HIERARCHIES = {
    # borked parent: good parent
    "australian defence forces (adf)": "defence forces",
    "immigration": "migration",
    "community protection": "customs",
    "security": "security and intelligence",
    "finance management": "financial matters",
    "education and training": "education",
    "governance": None,
    "customs regulations": "customs",
    "employment services": "employment",
    "health care": "health",
    "maritime services": "sea transport",
    "early childhood education": "education",
    "fiscal policy": "financial matters",
    "marine and rural regulation": "primary industries",
    "civic infrastructure": "works",
    "retirement income": "financial matters",
    "import regulation": "trade",
}

MOVE_DUPLICATE_PARENTS = {
    # term: correct parent
    "rail transport": "land transport",
    "road transport": "land transport",
    "tariff regulation": "customs",
    "overseas aid programs": "international relations",
    "consular services": "international relations",
}

DELETE_DUPLICATE_CHILDREN = {
    # term: children to delete
    "transport": ["rail transport", "road transport"],
    "trade": ["tariff regulation"],
    "foreign policy": ["overseas aid programs"],
    "government representation overseas": ["consular services"],
}

In [3]:

db = TinyDB("data/db_functions.json")


def save_terms(row):
    Record = Query()
    links = row.find_all("a", href=True)
    for link in links:
        db.upsert({"term": link.string.lower()}, Record.term == link.string.lower())


def save_relations(row):
    Record = Query()
    term = row.find("a").string.lower()
    if row.find("table"):
        # Loop through the rows underneath the term heading to get info about related terms
        for related in row.find("table").find_all("tr"):
            cells = related.find_all("td")
            if re.search(r"Broad term", cells[0].string):
                # This is the parent of the current term
                parent = cells[1].find("a").string.lower()
                if parent in BROKEN_HIERARCHIES:
                    parent = BROKEN_HIERARCHIES[parent]
                if term in MOVE_DUPLICATE_PARENTS:
                    parent = MOVE_DUPLICATE_PARENTS[term]
                if parent:
                    db.update(set("parent", parent), Record.term == term)
            if re.search(r"Narrow terms", cells[0].string):
                if term in BROKEN_HIERARCHIES:
                    term = BROKEN_HIERARCHIES[term]
                for link in cells[1].find_all("a"):
                    # These are children of the current term
                    child = link.string.lower()
                    if not (
                        term in DELETE_DUPLICATE_CHILDREN
                        and child in DELETE_DUPLICATE_CHILDREN[term]
                    ):
                        db.update(set("parent", term), Record.term == child)


def process_pages(loop):
    browser = mechanicalsoup.StatefulBrowser()
    browser.open("https://recordsearch.naa.gov.au/scripts/Logon.asp?N=guest")
    browser.select_form('form[id="t"]')
    browser.submit_selected()
    browser.open(
        "http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/AdvSearchFunctionsBrowsing.aspx"
    )
    for letter in tqdm(range(0, 26)):
        form = browser.select_form("#formSNRMaster")

        form.new_control(
            "input",
            "__EVENTTARGET",
            f"ctl00$ContentPlaceHolderSNR$ctl{str(letter).zfill(2)}",
        )
        browser.submit_selected()
        try:
            for row in browser.page.find(
                id="ContentPlaceHolderSNR_dlFunctions"
            ).find_all("tr", recursive=False):
                if loop == 1:
                    save_terms(row)
                elif loop == 2:
                    save_relations(row)
        except AttributeError:
            # No terms on this page
            pass


def harvest_functions():
    # Try looping through twice to deal with the inconsistences in the way relationships are defined
    # First loop is to get every term that has a link back into RS indicating that it might be used
    process_pages(loop=1)
    # Ok now we have every term, let's try to put them in a hierarchy
    process_pages(loop=2)


def get_children(parent):
    Record = Query()
    term = {"term": parent["term"], "narrower": []}
    children = db.search(Record.parent == parent["term"])
    for child in children:
        term["narrower"].append(get_children(child))
    return term


def make_hierarchy():
    Record = Query()
    terms = []
    parents = db.search(~(Record.parent.exists()))
    for parent in parents:
        terms.append(get_children(parent))
    return sorted(terms, key=itemgetter("term"))

In [ ]:

harvest_functions()
functions = make_hierarchy()

Save and download the results¶

Save the harvested functions as text and JSON files for easy download.

In [5]:

def get_text_levels(function, level):
    f_list = []
    if "narrower" in function:
        level += 1
        for subf in function["narrower"]:
            f_list.append("{}{} {}".format(level * "  ", level * "-", subf["term"]))
            f_list += get_text_levels(subf, level=level)
    return f_list


def save_text(functions, today):
    functions_list = []
    for function in functions:
        functions_list.append(function["term"].upper())
        functions_list += get_text_levels(function, level=0)
    with open(f"data/functions-{today}.txt", "w") as text_file:
        for row in functions_list:
            text_file.write("{}\n".format(row))


def get_csv_levels(function, row):
    rows = []
    if "narrower" in function:
        key = "level{}".format(len(row) + 1)
        for subf in function["narrower"]:
            this_row = deepcopy(row)
            this_row[key] = subf["term"]
            rows.append(this_row)
            rows += get_csv_levels(subf, this_row)
    return rows


def save_csv(functions, today):
    rows = []
    for function in functions:
        row = {"level1": function["term"]}
        rows.append(row)
        rows += get_csv_levels(function, row)
    df = pd.DataFrame(rows)
    df.to_csv(f"data/functions-{today}.csv", index=False)


def save_functions(functions):
    """
    Saves the harvested list of functions in text, json, and csv.
    """
    today = datetime.datetime.now().strftime("%Y%m%d")
    with open(f"data/functions-{today}.json", "w") as json_file:
        json.dump(functions, json_file, indent=4)
    save_text(functions, today)
    save_csv(functions, today)
    display(FileLink(f"data/functions-{today}.txt"))
    display(FileLink(f"data/functions-{today}.json"))
    display(FileLink(f"data/functions-{today}.csv"))

In [6]:

save_functions(functions)

data/functions-20220510.txt

data/functions-20220510.json

data/functions-20220510.csv

Created by Tim Sherratt as part of the GLAM Workbench.