#!/usr/bin/env python
# coding: utf-8

# # DFAT Cable Finder
#
# **If you ever need to find a file in the National Archives of Australia that contains a specific numbered cable from the Department of Foreign Affairs this is the tool for you!**
#
# Just give it a cable number and it will look in the series listed below for a file that might contain the cable. For each possible match it returns a link to the file as well as a bit of information about it.
#
# This tool works because many of the files in these series include the first and last numbered cable in the file title. So all it does is look at the numbers in each file title to see if the cable you're interested in falls somewhere between them. It's simple, but it's not something you can do in RecordSearch.
#
# It's far from perfect because the way the file titles are constructed is not always consistent, but it's quicker than looking through all the file titles manually.
#
# Series searched:
#
# * [A11785](http://www.naa.gov.au/cgi-bin/Search?O=S&Number=A11785) – Top Secret original and spares inward cables, annual single number series (1948-1972)
# * [A11786](http://www.naa.gov.au/cgi-bin/Search?O=S&Number=A11786) – Top Secret original and spares outward cables, single number series (1948-1972)
# * [A3195](http://www.naa.gov.au/cgi-bin/Search?O=S&Number=A3195) – Master sheets (used stencils) of inwards cables, annual single number series (1939-1949)
# * [A3196](http://www.naa.gov.au/cgi-bin/Search?O=S&Number=A3196) – Master sheets (used stencils) of outwards cables, annual single number series (1939-1949)
# * [A6364](http://www.naa.gov.au/cgi-bin/Search?O=S&Number=A6364) – Printed copies of inward cables with I (Inward) prefix filed in binders alphabetically by post (1950-1974)
# * [A6366](http://www.naa.gov.au/cgi-bin/Search?O=S&Number=A6366) – Printed copies of outward cables with O (Outward) prefix filed in binders alphabetically by post (1950-1974)
#
# Let me know if you'd like additional series added.
#
# If you want to refresh the series data from RecordSearch, just delete the
# `cables_data.json` file before running a search. The tool will then
# reharvest all the data.

# In[2]:

import json
import re
from copy import deepcopy  # no longer used in this cell; kept in case other cells rely on it
from html import escape

import ipywidgets as widgets
from IPython.display import HTML, display
from recordsearch_data_scraper.scrapers import RSItemSearch

# In[3]:

# NOTE(review): "A3195 " carries a trailing space. It is left untouched here in
# case harvesting depends on it, but it is stripped wherever the value is shown
# in the dropdown or compared against harvested records.
series = ["A11785", "A11786", "A3195 ", "A3196", "A6364", "A6366"]

# In[4]:

# File used to cache the harvested item records between runs.
DATA_FILE = "cables_data.json"

# NOTE(review): the item link markup was lost from this copy of the source; the
# URL below is a reconstruction using RecordSearch's item-number lookup.
# Confirm before relying on it.
ITEM_URL = "https://recordsearch.naa.gov.au/scripts/AutoSearch.asp?O=I&Number={}"

# Cable number as typed by the user, optionally prefixed with O/I (or a zero).
_CABLE_INPUT_RE = re.compile(r"[OI0]{0,1}\.{0,1}\s*?(\d+)")
# Numbers in a file title that are explicitly prefixed with O or I.
_PREFIXED_NUMBER_RE = re.compile(r"[OI]{1}\.{0,1}\s*?(\d+)")
# Any run of digits in a file title.
_ANY_NUMBER_RE = re.compile(r"\d+")
# Four-digit numbers that look like years (1910-1999).
_YEAR_RE = re.compile(r"^19[1-9]{1}\d{1}$")


def get_total_files(series):
    """
    Get the number of files in a series.

    Returns the `total_results` value reported by an `RSItemSearch` for the
    given series, converted to an int.
    """
    results = RSItemSearch(sort=5, digitised=False, series=series)
    return int(results.total_results)


def get_files(series):
    """
    Harvest file details from a series in RecordSearch.

    Calls `get_results()` repeatedly, accumulating each batch, until an
    empty batch is returned.
    """
    all_results = []
    item_search = RSItemSearch(series=series, sort=5)
    while True:
        results = item_search.get_results()
        if not results:
            break
        all_results += results
    return all_results


def refresh_data():
    """
    Harvest data from the listed series and save the results in a json file.

    Returns the combined list of harvested records.
    """
    results = []
    for s in series:
        results += get_files(s)
    with open(DATA_FILE, "w", encoding="utf-8") as json_file:
        json.dump(results, json_file)
    return results


def load_data():
    """
    Try to load preharvested data. If the data file doesn't exist (or can't
    be parsed as JSON), harvest it.
    """
    try:
        with open(DATA_FILE, "r", encoding="utf-8") as json_file:
            return json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        return refresh_data()


def check_year(r, year):
    """
    Return True if `year` falls within the contents date range of record `r`.

    The range is read from the first four characters of
    `r["contents_dates"]["start_date"]` and `["end_date"]`. Records with
    missing or unparseable dates, and years that aren't numbers, yield False
    rather than raising.
    """
    try:
        start = int(r["contents_dates"]["start_date"][:4])
        end = int(r["contents_dates"]["end_date"][:4])
        wanted = int(year)
    except (TypeError, KeyError, ValueError):
        return False
    return start <= wanted <= end


def _title_numbers(title):
    """Extract the candidate first/last cable numbers from a file title."""
    # Start conservatively, looking for O or I in front of numbers
    cables = _PREFIXED_NUMBER_RE.findall(title)
    if not cables:
        # If that didn't work find all numbers
        cables = _ANY_NUMBER_RE.findall(title)
    if len(cables) > 2:
        # If there are too many numbers, exclude ones that look like years
        cables = [c for c in cables if not _YEAR_RE.search(c)]
    return cables


def _format_result(result):
    """Build the HTML snippet describing one candidate file."""
    # Record values are escaped so that characters such as & or < in a title
    # can't break the rendered markup.
    parts = [
        '<div><p><a href="{}" target="_blank">NAA: {}, {}</a>'.format(
            escape(ITEM_URL.format(result["identifier"])),
            escape(str(result["series"])),
            escape(str(result["control_symbol"])),
        ),
        "<br>{}".format(escape(str(result["title"]))),
        "<br>{}".format(escape(str(result["contents_dates"]["date_str"]))),
    ]
    if result["digitised_status"] is True:
        parts.append(
            "<br>Digitised: {} pages".format(escape(str(result["digitised_pages"])))
        )
    parts.append("</p></div>")
    return "".join(parts)


def find_cable(cable, series=None, year=None):
    """
    Display files whose title number range could contain the given cable.

    cable  -- the cable number as entered, optionally prefixed with O or I
    series -- if given, only look at files from this series
    year   -- if given, only look at files whose contents dates span this year

    Output is written to the `display_results` widget, which is cleared first.
    Nothing is returned.
    """
    display_results.clear_output()

    match = _CABLE_INPUT_RE.search(str(cable))
    if match is None:
        # Stop here: without a number there is nothing to compare against.
        with display_results:
            print("Not a number")
        return
    cable_num = int(match.group(1))

    if year:
        try:
            int(year)
        except (TypeError, ValueError):
            with display_results:
                print("Year is not a number")
            return

    # Load pre harvested data
    candidates = load_data()
    if series:
        wanted_series = series.strip()
        candidates = [r for r in candidates if r["series"].strip() == wanted_series]
    if year:
        candidates = [r for r in candidates if check_year(r, year)]

    found = 0
    for result in candidates:
        cables = _title_numbers(result["title"])
        # Only titles with exactly two numbers give a usable first/last range.
        if len(cables) != 2:
            continue
        if int(cables[0]) <= cable_num <= int(cables[1]):
            found += 1
            # Display the details of each candidate
            with display_results:
                display(HTML(_format_result(result)))

    if not found:
        with display_results:
            print("No matching files found")


def run_query(b):
    """Button click handler: run a search using the current widget values."""
    find_cable(cable.value, series=series_select.value, year=year.value)


# All the widgety things
# "All" is prepended, not assigned over index 0, so that the first series
# remains selectable.
series_options = [("All", None)] + [(s.strip(), s.strip()) for s in series]
series_select = widgets.Dropdown(options=series_options, description="Series:")
year = widgets.Text(
    value=None, placeholder="filter by year, eg 1940", description="Year:"
)
cable = widgets.Text(value=None, placeholder="enter cable number", description="Cable:")
display_results = widgets.Output(layout=widgets.Layout(margin="40px 0 0 0"))
button = widgets.Button(
    description="Find files!",
    button_style="primary",
    layout=widgets.Layout(margin="20px 0 0 0"),
)
button.on_click(run_query)

# NOTE(review): the tags around the two text snippets below were lost from this
# copy of the source; <h3> and <p> are reconstructions.
display(HTML("<h3>Find files containing this numbered cable</h3>"))
display(
    widgets.VBox(
        [
            cable,
            widgets.HTML(
                "<p>Filter by series and/or year to reduce the number of results</p>"
            ),
            series_select,
            year,
            button,
            display_results,
        ]
    )
)