#!/usr/bin/env python
# coding: utf-8

# # Harvesting functions from the RecordSearch interface
#
# This notebook attempts to extract information from the RecordSearch interface about the hierarchy of functions it uses to describe the work of government agencies.
#
# [Previous explorations](https://timsherratt.org/research-notebook/aggregated-archives/notes/naa-functions/) have shown that the NAA's use of functions is rather inconsistent. All I'm doing here is finding out what functions RecordSearch itself says it is using. This may not be complete, but it seems like a useful starting point.
#
# There are a few inconsistencies that I've tried to clean up. In particular, the hierarchy is broken in a number of places where a child term links up to a non-preferred term. In this case I've replaced the non-preferred term with the preferred term.
#
# I've also noticed that some 'narrower' terms don't have their own entries in the main list, so I've made sure that these are all added in.
#
# I suspect that the majority of these terms are never used, but we'll save that question for another notebook...
# In[1]:

import datetime
import json
import os
import re
from copy import deepcopy
from operator import itemgetter

import mechanicalsoup
import pandas as pd
from IPython.display import FileLink, display
from tinydb import Query, TinyDB

# NOTE: this import shadows the builtin ``set`` for the rest of the file.
# In this module ``set(field, value)`` is the TinyDB update operation.
from tinydb.operations import set
from tqdm.auto import tqdm

# Make sure there's somewhere to save data files
os.makedirs("data", exist_ok=True)


# In[2]:

# The harvesting code
# Much kludginess here to deal with inconsistencies in RS

BROKEN_HIERARCHIES = {
    # borked parent: good parent
    "australian defence forces (adf)": "defence forces",
    "immigration": "migration",
    "community protection": "customs",
    "security": "security and intelligence",
    "finance management": "financial matters",
    "education and training": "education",
    "governance": None,
    "customs regulations": "customs",
    "employment services": "employment",
    "health care": "health",
    "maritime services": "sea transport",
    "early childhood education": "education",
    "fiscal policy": "financial matters",
    "marine and rural regulation": "primary industries",
    "civic infrastructure": "works",
    "retirement income": "financial matters",
    "import regulation": "trade",
}

MOVE_DUPLICATE_PARENTS = {
    # term: correct parent
    "rail transport": "land transport",
    "road transport": "land transport",
    "tariff regulation": "customs",
    "overseas aid programs": "international relations",
    "consular services": "international relations",
}

DELETE_DUPLICATE_CHILDREN = {
    # term: children to delete
    "transport": ["rail transport", "road transport"],
    "trade": ["tariff regulation"],
    "foreign policy": ["overseas aid programs"],
    "government representation overseas": ["consular services"],
}


# In[3]:

db = TinyDB("data/db_functions.json")


def save_terms(row):
    """Upsert a db record for every linked term found in a results row.

    Every ``<a href>`` in the row is saved, so terms that only appear as a
    related (broader/narrower) term still get their own record.

    Args:
        row: a BeautifulSoup ``<tr>`` element from the functions listing.
    """
    Record = Query()
    for link in row.find_all("a", href=True):
        if link.string is None:
            # The anchor has no single text node, so there is no term to save.
            continue
        term = link.string.lower()
        db.upsert({"term": term}, Record.term == term)


def save_relations(row):
    """Record parent/child relationships described in a results row.

    The first link in the row is the term heading. If the row contains a
    nested table, its rows are scanned for "Broad term" (the parent of the
    heading term) and "Narrow terms" (children of the heading term), and the
    ``parent`` field of the matching db records is set accordingly. The
    clean-up tables ``BROKEN_HIERARCHIES``, ``MOVE_DUPLICATE_PARENTS`` and
    ``DELETE_DUPLICATE_CHILDREN`` are applied along the way.

    Args:
        row: a BeautifulSoup ``<tr>`` element from the functions listing.
    """
    Record = Query()
    heading = row.find("a")
    if heading is None or heading.string is None:
        # No usable term heading in this row
        return
    term = heading.string.lower()
    table = row.find("table")
    if table is None:
        return
    # Loop through the rows underneath the term heading to get info about related terms
    for related in table.find_all("tr"):
        cells = related.find_all("td")
        if len(cells) < 2:
            continue
        label = cells[0].get_text()
        if re.search(r"Broad term", label):
            # This is the parent of the current term
            parent_link = cells[1].find("a")
            if parent_link is not None and parent_link.string is not None:
                parent = parent_link.string.lower()
                # Swap a non-preferred parent for the preferred one (may be None)
                parent = BROKEN_HIERARCHIES.get(parent, parent)
                parent = MOVE_DUPLICATE_PARENTS.get(term, parent)
                # Remapping can point a term at itself; a self-parent would make
                # the term unreachable when the hierarchy is assembled.
                if parent and parent != term:
                    db.update(set("parent", parent), Record.term == term)
        if re.search(r"Narrow terms", label):
            # Use a separate name so the heading term is not changed for any
            # later rows in this table.
            preferred = BROKEN_HIERARCHIES.get(term, term)
            if not preferred:
                # No valid parent to assign; writing ``parent=None`` would hide
                # the children from both the root query and every parent query.
                continue
            unwanted = DELETE_DUPLICATE_CHILDREN.get(preferred, ())
            for link in cells[1].find_all("a"):
                if link.string is None:
                    continue
                # These are children of the current term
                child = link.string.lower()
                if child not in unwanted and child != preferred:
                    db.update(set("parent", preferred), Record.term == child)


def process_pages(loop):
    """Log in to RecordSearch and walk the 26 pages of the functions browser.

    Args:
        loop: ``1`` to save every term found (``save_terms``), ``2`` to save
            the relationships between terms (``save_relations``).

    Raises:
        ValueError: if ``loop`` is neither 1 nor 2 (raised before any request
            is made).
    """
    handlers = {1: save_terms, 2: save_relations}
    if loop not in handlers:
        raise ValueError(f"loop must be 1 or 2, got {loop!r}")
    handle_row = handlers[loop]

    browser = mechanicalsoup.StatefulBrowser()
    browser.open("https://recordsearch.naa.gov.au/scripts/Logon.asp?N=guest")
    browser.select_form('form[id="t"]')
    browser.submit_selected()
    browser.open(
        "http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/AdvSearchFunctionsBrowsing.aspx"
    )
    for letter in tqdm(range(0, 26)):
        form = browser.select_form("#formSNRMaster")
        form.new_control(
            "input",
            "__EVENTTARGET",
            f"ctl00$ContentPlaceHolderSNR$ctl{str(letter).zfill(2)}",
        )
        browser.submit_selected()
        page = browser.page
        listing = (
            page.find(id="ContentPlaceHolderSNR_dlFunctions")
            if page is not None
            else None
        )
        if listing is None:
            # No terms on this page
            continue
        # Only the "no listing" case is skipped; errors raised while handling
        # a row are no longer swallowed along with the rest of the page.
        for row in listing.find_all("tr", recursive=False):
            handle_row(row)


def harvest_functions():
    """Harvest all terms, then their relationships, into the db."""
    # Try looping through twice to deal with the inconsistencies in the way relationships are defined
    # First loop is to get every term that has a link back into RS indicating that it might be used
    process_pages(loop=1)
    # Ok now we have every term, let's try to put them in a hierarchy
    process_pages(loop=2)


def get_children(parent):
    """Build the nested ``{"term": ..., "narrower": [...]}`` tree for a term.

    Args:
        parent: a db record (dict) with at least a ``"term"`` key.

    Returns:
        A dict with the term's name and a list of the same structure for each
        db record whose ``parent`` field equals this term.
    """
    Record = Query()
    children = db.search(Record.parent == parent["term"])
    return {
        "term": parent["term"],
        "narrower": [get_children(child) for child in children],
    }


def make_hierarchy():
    """Return the full hierarchy as a list of top-level trees sorted by term.

    Top-level terms are the db records that have no ``parent`` field.
    """
    Record = Query()
    roots = db.search(~(Record.parent.exists()))
    return sorted((get_children(root) for root in roots), key=itemgetter("term"))


# In[ ]:

harvest_functions()
functions = make_hierarchy()


# ## Save and download the results
#
# Save the harvested functions as text and JSON files for easy download.

# In[5]:


def get_text_levels(function, level):
    """Return indented text lines for every descendant of ``function``.

    Args:
        function: a ``{"term": ..., "narrower": [...]}`` dict.
        level: depth of ``function`` itself; its children are written one
            level deeper.

    Returns:
        A list of strings, one per descendant, in depth-first order.
    """
    lines = []
    if "narrower" in function:
        level += 1
        for subf in function["narrower"]:
            lines.append(f"{level * ' '}{level * '-'} {subf['term']}")
            lines += get_text_levels(subf, level=level)
    return lines


def save_text(functions, today):
    """Write the hierarchy to ``data/functions-{today}.txt`` as indented text.

    Top-level terms are written in upper case, followed by their descendants.
    """
    functions_list = []
    for function in functions:
        functions_list.append(function["term"].upper())
        functions_list += get_text_levels(function, level=0)
    with open(f"data/functions-{today}.txt", "w", encoding="utf-8") as text_file:
        text_file.writelines(f"{row}\n" for row in functions_list)


def get_csv_levels(function, row):
    """Return one row dict per descendant of ``function``.

    Each row copies the ancestors' ``levelN`` values from ``row`` and adds the
    descendant's own term under the next ``levelN`` key.

    Args:
        function: a ``{"term": ..., "narrower": [...]}`` dict.
        row: the row dict already built for ``function``.
    """
    rows = []
    if "narrower" in function:
        key = f"level{len(row) + 1}"
        for subf in function["narrower"]:
            this_row = deepcopy(row)
            this_row[key] = subf["term"]
            rows.append(this_row)
            rows += get_csv_levels(subf, this_row)
    return rows


def save_csv(functions, today):
    """Write the hierarchy to ``data/functions-{today}.csv``, one term per row."""
    rows = []
    for function in functions:
        row = {"level1": function["term"]}
        rows.append(row)
        rows += get_csv_levels(function, row)
    df = pd.DataFrame(rows)
    df.to_csv(f"data/functions-{today}.csv", index=False)


def save_functions(functions):
    """
    Saves the harvested list of functions in text, json, and csv.

    Files are written to the ``data`` directory with today's date in the
    file name, and a download link for each one is displayed.
    """
    today = datetime.datetime.now().strftime("%Y%m%d")
    with open(f"data/functions-{today}.json", "w", encoding="utf-8") as json_file:
        json.dump(functions, json_file, indent=4)
    save_text(functions, today)
    save_csv(functions, today)
    display(FileLink(f"data/functions-{today}.txt"))
    display(FileLink(f"data/functions-{today}.json"))
    display(FileLink(f"data/functions-{today}.csv"))


# In[6]:

save_functions(functions)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) as part of the [GLAM Workbench](https://glam-workbench.github.io/).