Harvest files with the access status of 'closed'

The National Archives of Australia's RecordSearch database includes some information about files that we're not allowed to see. These files have been through the access examination process and ended up with an access status of 'closed'. You can read about my efforts to extract and interpret this data in Inside Story.

While you can search by access status in RecordSearch, you can't explore the reasons, so if you want to dig any deeper you need to harvest the data. This notebook shows you how.

The code used in this notebook is similar to that in harvesting items from a search. The only real difference is that full item records are harvested by default, and the access reasons are processed to separate and normalise values that have been munged together.

This notebook uses the RecordSearch Data Scraper to do most of the work. Note that the RecordSearch Data Scraper caches results to improve efficiency, which also makes it easy to resume a failed harvest. If you want to completely refresh a harvest, delete the cache_db.sqlite file to start from scratch.
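
If you do want to start completely from scratch, you can remove the cache before running the harvest. The cell below is a minimal sketch that assumes the cache file uses the default name, cache_db.sqlite, and sits alongside this notebook.

In [ ]:
# Optional: delete the scraper's cache to force a completely fresh harvest.
# Assumes the default cache file name, cache_db.sqlite, in the current working directory.
from pathlib import Path

cache_db = Path('cache_db.sqlite')
if cache_db.exists():
    cache_db.unlink()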

Setting things up

In [ ]:
import json
import re
import time
from datetime import datetime
from pathlib import Path
import pandas as pd
from IPython.display import FileLink, display
from recordsearch_data_scraper.scrapers import RSItemSearch
from tqdm.auto import tqdm
In [ ]:
# Regular expressions to match against the reasons in RecordSearch to normalise them
EXCEPTIONS = [
    ['33(1)(a)', r'33\(1\)\(a\)'],
    ['33(1)(b)', r'33\(1\)[a\(\)]*\(b\)'],
    ['33(1)(c)', r'33\(1\)[ab\(\)]*\(c\)'],
    ['33(1)(d)', r'33\(1\)[abc\(\)]*\(d\)'],
    ['33(1)(e)(i)', r'33\(1\)[abcd\(\)]*\(e\)\(i\)'],
    ['33(1)(e)(ii)', r'33\(1\)[abcd\(\)]*\(e\)\(ii\)'],
    ['33(1)(e)(iii)', r'33\(1\)[abcd\(\)]*\(e\)\(iii\)'],
    ['33(1)(f)(i)', r'33\(1\)[abcdei\(\)]*\(f\)\(i\)'],
    ['33(1)(f)(ii)', r'33\(1\)[abcdei\(\)]*\(f\)\(ii\)'],
    ['33(1)(f)(iii)', r'33\(1\)[abcdei\(\)]*\(f\)\(iii\)'],
    ['33(1)(g)', r'33\(1\)[abcdefi\(\)]*\(g\)'],
    ['33(1)(h)', r'33\(1\)[abcdefgi\(\)]*\(h\)'],
    ['33(1)(j)', r'33\(1\)[abcdefghi\(\)]*\(j\)'],
    ['33(2)(a)', r'33\(2\)\(a\)'],
    ['33(2)(b)', r'33\(2\)[a\(\)]*\(b\)'],
    ['33(3)(a)(i)', r'33\(3\)\(a\)\(i\)'],
    ['33(3)(a)(ii)', r'33\(3\)\(a\)(\(i\))?\(ii\)'],
    ['33(3)(b)', r'33\(3\)[ai\(\) &]*\(b\)'],
    ['Closed period', r'Closed period.*']
]
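
To check that the patterns behave as expected, you can match them against a sample value. The cell below is just a quick sanity check using a made-up, munged value rather than real RecordSearch data.

In [ ]:
# Quick check with a hypothetical munged value: '33(1)(a)(b)' should match
# both the 33(1)(a) and 33(1)(b) patterns and nothing else.
for label, pattern in EXCEPTIONS:
    if re.match(pattern, '33(1)(a)(b)'):
        print(label)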

Harvest the records

In [ ]:
def normalise_reasons(items):
    '''
    Uses a set of regex patterns to try to extract individual reasons from the reason values,
    which can sometimes be munged together.
    '''
    for item in items:
        item['reasons'] = []
        # The access reason field can munge together multiple reasons, so we need to separate and normalise them
        for reason in item['access_decision_reasons']:
            matched = False
            # Loop through the regexp patterns to see what we can find in the access reason field, save any matches
            for exception, pattern in EXCEPTIONS:
                if re.match(pattern, reason):
                    item['reasons'].append(exception)
                    matched = True
            if not matched:
                # If nothing matches, just save the original
                item['reasons'].append(reason)
    return items

items = []
search = RSItemSearch(record_detail='full', access='Closed')
with tqdm(total=search.total_results) as pbar:
    more = True
    while more:
        data = search.get_results()
        if data['results']:
            items += normalise_reasons(data['results'])
            pbar.update(len(data['results']))
            time.sleep(0.5)
        else:
            more = False
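
Once the harvest has finished, it can be useful to have a quick look at the most common normalised reasons. This is an optional check that assumes the cell above has populated the items list.

In [ ]:
# Optional: tally the normalised reasons across all harvested items.
# Assumes the harvest above has finished and `items` is populated.
reason_counts = pd.Series([reason for item in items for reason in item['reasons']])
reason_counts.value_counts().head(10)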

Save the results for download

In [ ]:
def save_harvest(search, items):
    params = search.params.copy()
    params.update(search.kwargs)
    today = datetime.now()
    search_param_str = '_'.join(sorted([f'{k}_{v}' for k, v in params.items() if v is not None and k not in ['results_per_page', 'sort']]))
    data_dir = Path('harvests', f'{today.strftime("%Y%m%d")}_{search_param_str}')
    data_dir.mkdir(exist_ok=True, parents=True)
    metadata = {
        'date_harvested': today.isoformat(),
        'search_params': search.params,
        'search_kwargs': search.kwargs,
        'total_results': search.total_results,
        'total_harvested': len(items)
    }

    with Path(data_dir, 'metadata.json').open('w') as md_file:
        json.dump(metadata, md_file)

    with Path(data_dir, 'results.jsonl').open('w') as data_file:
        for item in items:
            data_file.write(json.dumps(item) + '\n')

    # Flatten the nested results and save them as a CSV file
    df = pd.json_normalize(items)
    df.to_csv(Path(data_dir, 'results.csv'), index=False)
    display(FileLink(Path(data_dir, 'metadata.json')))
    display(FileLink(Path(data_dir, 'results.jsonl')))
    display(FileLink(Path(data_dir, 'results.csv')))
    return str(data_dir)
In [ ]:
save_harvest(search, items)
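
Because save_harvest() returns the path of the directory it creates, you can also capture that path and reload the flattened CSV straight away for a quick look. The cell below is an optional sketch rather than a required step.

In [ ]:
# Optional: capture the harvest directory and reload the CSV to check the results.
data_dir = save_harvest(search, items)
df_closed = pd.read_csv(Path(data_dir, 'results.csv'))
df_closed.head()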

Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!