This notebook includes code that will enable you to harvest individual record details from a search in the National Archives of New Zealand's online database, Archway.
If you search for keywords only there's a limit of 1,000 results returned. But it seems that if you add extra parameters, such as a date range, the maximum number of results returned is 10,000.
If you want to harvest more records than this, you'll need to break your search up into chunks of less than 10,000. I'll give some possible strategies for this below.
If you're not familiar with Jupyter notebooks like this one, here are a few basic tips.
Press Shift+Enter to execute the code in each cell.
# Import the modules we'll need
# Yes this is a code cell, hit Shift+Enter to run it!
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
from IPython.display import display, HTML, FileLink
from tqdm import tqdm, tqdm_notebook
# A single persistent session, so cookies and server-side state gathered
# during the search set-up are reused by every later request.
s = requests.Session()
# These are the default search parameters for the Advanced search page
# Don't change this here. We'll add values below.
# You still need to run it though -- so Shift+Enter again!
# Most of the text inputs default to an empty string, so build those first...
params = dict.fromkeys([
    'accessionNumber',
    'agencyCode',
    'alternativeRecordNumber',
    'boxNumber',
    'code',
    'endYear',
    'exclude',
    'formerArchivesRef',
    'keyword',
    'recordNumber',
    'sepNumber',
    'seriesNumber',
    'startYear',
], '')
# ...then fill in the fields that have non-empty defaults.
params.update({
    'excludeSearchTypeID': '2',  # 2=any, 3=exact
    'format': 'All',  # Options: All, Artwork, map/plan, Moving Image, Not determined, Object, Photograph, Sound Recording, text
    'heldauckland': 'on',
    'heldchristchurch': 'on',
    'helddigitalrepository': 'on',
    'helddunedin': 'on',
    'heldother': 'on',
    'heldwellington': 'on',
    'includeUnknown': 'on',
    'keywordSearchTypeID': '1',  # 1=all, 2=any, 3=exact
    'performSearchImageButton.x': '53',
    'performSearchImageButton.y': '8',
})
# These are the default fields for an item.
# I'm assuming they're consistent!
# Kept in the exact order the cells appear in the item details row.
details_fields = (
    'Item ID; Agency; Series; Accession; Record group; Box/Item; '
    'Sep; Record no.; Part; Alternative no.; Record type'
).split('; ')
You shouldn't have to edit anything here. Just run the code cell to load everything up.
# Yep, you guessed it, hit Shift+Enter again! Seeing a pattern here?
def strip_string(cell):
    '''
    Return the whitespace-stripped text of a BeautifulSoup cell,
    or an empty string when the cell has no (or an empty) string.
    '''
    text = cell.string
    return text.strip() if text else ''
def process_item_page(response):
    '''
    Extract details from an individual record page.

    `response` is the HTTP response for a ViewFullItem.do page. Returns a
    dict with 'Title', 'Date', one entry per name in `details_fields`,
    'Former archives ref', and 'Access status'.

    NOTE(review): this assumes the page layout is fixed -- if any of the
    header cells ('Title', 'Item ID', 'Former archives ref') or the
    restriction-text element is missing, `.parent` / `.strong` on the
    failed find() will raise AttributeError.
    '''
    details = {}
    soup = BeautifulSoup(response.text, 'lxml')
    # The data sits in the row *after* the row containing the header cell,
    # hence find the header <td>, go up to its <tr>, then take the next <tr>.
    title_row = soup.find('td', string='Title').parent.find_next_sibling('tr')
    title_cells = title_row.find_all('td')
    # Title can contain nested markup, so use get_text() rather than .string
    details['Title'] = title_cells[0].get_text().strip()
    details['Date'] = strip_string(title_cells[1])
    details_row = soup.find('td', string='Item ID').parent.find_next_sibling('tr')
    details_cells = details_row.find_all('td')
    # Map each cell positionally onto the field names in details_fields --
    # assumes the cells are always in that order (TODO confirm).
    for index, field in enumerate(details_fields):
        details[field] = strip_string(details_cells[index])
    former_row = soup.find('td', string='Former archives ref').parent.find_next_sibling('tr')
    details['Former archives ref'] = strip_string(former_row.td)
    details['Access status'] = strip_string(soup.find(class_='restriction-text').strong)
    return details
def process_page(soup):
    '''
    Work through a page of search results, getting the details of each individual record.

    Returns a list of detail dicts, one per item link found on the page.
    '''
    records = []
    item_links = soup.find_all('a', href=re.compile('ViewFullItem.do'))
    for item_link in tqdm_notebook(item_links, leave=False):
        # Pull the numeric item code out of the link and fetch its full page
        item_id = re.search(r'ViewFullItem\.do\?code=(\d+)', item_link['href']).group(1)
        response = s.get(f'https://www.archway.archives.govt.nz/ViewFullItem.do?code={item_id}')
        records.append(process_item_page(response))
        # Be polite to the server between item requests
        time.sleep(0.2)
    return records
def prepare_search(params):
    '''
    Submit the search and return the total number of matching results.

    Steps through the site's search pages first so the session picks up
    whatever cookies/state the server expects before accepting a POST.
    (Probably overly cautious, but harmless.)
    '''
    base = 'https://www.archway.archives.govt.nz'
    for path in ['/', '/CallAdvancedSearch.do', '/CallItemAdvancedSearch.do']:
        s.get(base + path)
    search_response = s.post(base + '/ItemAdvancedSearch.do', data=params)
    soup = BeautifulSoup(search_response.text, 'lxml')
    page_params = get_page_params(soup, 1)
    try:
        return int(page_params['searchResultsContainer.totalResultSize'])
    except KeyError:
        # No result-size field on the page means the search matched nothing
        return 0
def get_page_params(soup, page):
    '''
    Get the embedded search details in a results page to feed to the next page request.

    Collects the hidden 'searchResultsContainer' inputs and sets the
    requested page number.
    '''
    hidden_inputs = soup.find_all('input', {'name': re.compile('searchResultsContainer'), 'type': 'hidden'})
    page_params = {field['name']: field['value'] for field in hidden_inputs}
    page_params['searchResultsContainer.page'] = page
    return page_params
def harvest_results(params):
    '''
    Harvest results using the supplied parameters.

    Runs the search, then pages through the results, returning a list of
    detail dicts (one per record) built by process_item_page().
    '''
    total_results = prepare_search(params)
    # Set up some defaults
    page = 1
    results = []
    # Loop through the results pages, extracting details of individual records
    with tqdm_notebook(total=total_results, leave=False, unit='record') as pbar:
        while len(results) < total_results:
            search_response = s.post('https://www.archway.archives.govt.nz/ItemAdvancedSearchResults.do', data=params)
            soup = BeautifulSoup(search_response.text, 'lxml')
            new_results = process_page(soup)
            # BUGFIX: if a page yields no parseable records (server error,
            # layout change), the original loop re-requested the same page
            # forever. Bail out instead of spinning.
            if not new_results:
                break
            results += new_results
            page += 1
            # Need these params to get the next page of results
            params = get_page_params(soup, page)
            time.sleep(0.5)
            pbar.update(len(new_results))
    return results
You need to feed your search terms into the parameters defined above. For example, to search for the keyword Chinese
, you'd enter the code:
query_params['keyword'] = 'Chinese'
To search for items in a particular series you'd enter:
query_params['seriesNumber'] = '8333'
To search for an item with a particular record number you'd enter:
query_params['recordNumber'] = '1883/3052'
You can set multiple parameters.
# Make a copy of the default params
query_params = params.copy()
# Let's set some parameters -- edit as you see fit
# Once you've finished editing, hit Shift-Enter
# query_params['keyword'] = 'Chinese'
query_params['seriesNumber'] = '8333'
query_params['keyword'] = 'naturalisation naturalization'
# Make it an 'any' search
query_params['keywordSearchTypeID'] = 2
query_params['startYear'] = '1840'
query_params['endYear'] = '1905'
# This excludes records without a date.
# NOTE(review): checked checkboxes post the string 'on'; the boolean False
# here is form-encoded as the string 'False' -- it appears to work, but
# confirm the server really treats that as unchecked.
query_params['includeUnknown'] = False
# Run this cell (Shift+Enter) to kick things off
# When the asterisk turns to a number in the square brackets, your harvest will have finished
results = harvest_results(query_params)
So what do you do if you want to harvest more than the limit of 10,000 records? Basically, you need to think about ways of breaking the search up into smaller chunks. From my brief explorations it seems that the record number
field supports wildcard searches, so if the records were numbered with prefixes between 1 and 100, for example, you could try something like this:
# Example only: break a too-large search into chunks using wildcard
# record-number searches, one harvest per prefix ('1/*', '2/*', ... '100/*').
results = []
for prefix in range(1, 101):
    params['recordNumber'] = '{}/*'.format(prefix)
    results += harvest_results(params)
Alternatively, you could break your date span up into individual years. Note that this will likely mean that you'll have duplicate records in your dataset, but that can be easily fixed in Pandas using .drop_duplicates()
. Also note that I've compared the results of searches broken down by year with searches for a continuous date span and, for some reason, the year-by-year searches return fewer results. I don't know why.
# Make a copy of the default params
query_params = params.copy()
query_params['seriesNumber'] = '8333'
query_params['keyword'] = 'naturalisation naturalization'
# Make it an 'any' search
query_params['keywordSearchTypeID'] = 2
results = []
# Search one year at a time (1840-1905 inclusive) so each individual
# harvest stays under the 10,000-result cap.
for start_year in tqdm_notebook(range(1840, 1906), unit='year'):
    query_params['startYear'] = start_year
    query_params['endYear'] = start_year
    # This excludes records without a date
    query_params['includeUnknown'] = False
    results += harvest_results(query_params)
# Pandas makes it stupidly easy to save data as a CSV
# First convert the results into a DataFrame
df = pd.DataFrame(results)
# Drop duplicate rows.
# BUGFIX: drop_duplicates() returns a *new* DataFrame -- the original code
# discarded the return value, so duplicates were never actually removed.
df = df.drop_duplicates()
# Save as a CSV with good old Shift+Enter
csv_filename = 'results.csv' # change this to whatever you want
df.to_csv(csv_filename, index=False)
display(FileLink(csv_filename))