Somewhat confusingly, the Australian Women's Weekly is in with Trove's digitised newspapers and not the rest of the magazines. There are notebooks in the GLAM Workbench's journals section to help harvest all of a journal's covers as images, so I thought I should do the same for the Weekly.
Just change the `TITLE_ID`, `START_YEAR`, `END_YEAR`, and `PREFIX` values to harvest all the front pages of any digitised newspaper.
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re
from pathlib import Path
from tqdm.auto import tqdm
import time
import pandas as pd
from IPython.display import display, FileLink
# Shared HTTP session used for every request in this notebook.
# Requests that come back with a 502, 503 or 504 status are retried
# (up to 5 times, with a backoff between attempts).
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for scheme in ('https://', 'http://'):
    # Each scheme gets its own adapter, configured with the same retry policy
    s.mount(scheme, HTTPAdapter(max_retries=retries))
Modify the values below as required.
# Your Trove API key -- passed as the 'key' parameter when requesting title details
API_KEY = 'YOUR API KEY'
# The id of the newspaper you want to harvest
TITLE_ID = '112' # 112 is the AWW
# Range of years to harvest
# NOTE: the harvest loops over range(START_YEAR, END_YEAR), so START_YEAR is
# included but END_YEAR itself is not -- set END_YEAR to one past the last year you want
START_YEAR = 1933
END_YEAR = 1983
# A prefix to use in file names, if None then the title_id will be used
PREFIX = 'aww'
# API url used to request the details (including lists of issues) of the selected title
TITLE_URL = f'https://api.trove.nla.gov.au/v2/newspaper/title/{TITLE_ID}'
def get_current_year(years, year):
    '''
    Pick out the record for a single year from a list of per-year records.

    Each record in `years` is a dict whose 'date' value is compared with
    `year` converted to a string.
    Returns the first matching record, or None if no record matches.
    '''
    wanted = str(year)
    return next((entry for entry in years if entry['date'] == wanted), None)
def get_issues():
    '''
    Get all the issue details by looping through the range of years.

    One API request is made for each year from START_YEAR up to, but not
    including, END_YEAR. Years for which the API returns no matching year
    record, or a record without an 'issue' list, are skipped rather than
    stopping the harvest.

    Returns a list of issues.
    Raises requests.HTTPError if the API responds with an error status.
    '''
    params = {
        'encoding': 'json',
        'include': 'years',
        'key': API_KEY
    }
    issues = []
    for year in tqdm(range(START_YEAR, END_YEAR), desc='Issues'):
        # Setting 'range' tells the API to give us a list of issue dates & urls within that date range
        params['range'] = f'{year}0101-{year}1231'
        # Get the data
        response = s.get(TITLE_URL, params=params)
        # Stop with a clear HTTP error instead of trying to parse an error response as JSON
        response.raise_for_status()
        data = response.json()
        # Extract the details for the current year
        year_data = get_current_year(data['newspaper']['year'], year)
        # get_current_year() returns None if the year isn't in the results,
        # and a year record might not include any issues -- skip both cases
        if year_data:
            issues.extend(year_data.get('issue', []))
        # Pause between API requests
        time.sleep(0.2)
    return issues
def get_file_prefix():
    '''
    Work out the prefix used for file names and the data directory.
    Returns PREFIX when it has a (non-empty) value, otherwise TITLE_ID.
    '''
    return PREFIX or TITLE_ID
def create_output_dir(file_prefix):
    '''
    Make sure the directory 'data/<file_prefix>' exists.
    Missing parent directories are created, and an existing directory is left alone.
    Returns the directory as a Path.
    '''
    output_dir = Path('data') / file_prefix
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir
def download_page(page_id, size, file_path):
    '''
    Download page image using the supplied id.
    Size range is 1 to 7 (7 being the highest res)

    The image is written to file_path (a Path). Nothing is written if the
    request fails, so a failed download can be retried on a later run
    instead of leaving an error response saved under a .jpg name.

    Raises requests.HTTPError if the image service responds with an error status.
    '''
    # Format the page url using the page id
    page_url = f'http://trove.nla.gov.au/ndp/imageservice/nla.news-page{page_id}/level{size}'
    # Download the image
    response = s.get(page_url)
    # Check the status BEFORE saving -- harvest_covers() skips files that already
    # exist, so a saved error response would never be replaced by the real image
    response.raise_for_status()
    file_path.write_bytes(response.content)
    # Pause between image downloads
    time.sleep(0.5)
def harvest_covers(size=5):
    '''
    Get a list of issues of the title.
    Loop through the issues downloading each front page/cover.
    Return issue metadata.

    `size` is passed through to download_page() to select the image resolution.
    Images that already exist in the output directory are not downloaded again,
    so an interrupted harvest can be resumed by running this again.
    Each issue in the returned list has 'page_id' and 'image_name' values added.

    Raises requests.HTTPError if an issue request returns an error status,
    and ValueError if no page id can be found in the url an issue redirects to.
    '''
    # Get a list of issues
    issues = get_issues()
    # Set up the output directory once -- it doesn't change between issues
    file_prefix = get_file_prefix()
    dir_path = create_output_dir(file_prefix)
    # Loop through the issues
    for issue in tqdm(issues, desc='Pages'):
        # Request the issue url
        response = s.get(issue['url'])
        # Don't try to extract a page id from the url of an error response
        response.raise_for_status()
        # The issue url will be redirected to a page url
        # Extract the page id from the page url
        match = re.search(r'(\d+)$', response.url)
        if match is None:
            raise ValueError(f'Could not find a page id in {response.url}')
        page_id = match.group(1)
        # Save page id to metadata
        issue['page_id'] = page_id
        file_path = Path(dir_path, f'{file_prefix}-{issue["date"].replace("-", "")}-page{page_id}.jpg')
        # If the image hasn't already been downloaded, then download it!
        if not file_path.exists():
            download_page(page_id, size, file_path)
        # Save the image name to the metadata
        issue['image_name'] = file_path.name
        # Pause between issue requests
        time.sleep(0.2)
    return issues
# Harvest the covers, then load the issue metadata into a dataframe,
# renaming the 'id' column to 'issue_id'
issues = harvest_covers()
df = pd.DataFrame(issues).rename(columns={'id': 'issue_id'})
df.head()
# Save the issue metadata as a CSV file and display a link to it
file_prefix = get_file_prefix()
csv_path = f'data/{file_prefix}-issues.csv'
df.to_csv(csv_path, index=False)
display(FileLink(csv_path))
Created by Tim Sherratt for the GLAM Workbench.