Harvesting Australian Women's Weekly covers

(or all the front pages of any digitised newspaper)

Somewhat confusingly, the Australian Women's Weekly is grouped with Trove's digitised newspapers rather than with the rest of the digitised magazines. The GLAM Workbench's journals section already includes notebooks for harvesting all of a journal's covers as images, so I thought I should do the same for the Weekly.

Just change the TITLE_ID, START_YEAR, END_YEAR, and PREFIX values below to harvest all the front pages of any digitised newspaper.
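
If you're not sure of a newspaper's id, you can look it up via the Trove API's newspaper/titles endpoint, which lists all the digitised newspaper titles and their ids. The cell below is just an optional sketch of one way to do this — it assumes the v2 API's response structure (response → records → newspaper), so inspect the raw JSON if the keys look different, and remember to insert your own API key.

In [ ]:
import requests

# List digitised newspaper titles and their ids so you can find a TITLE_ID to harvest.
# Assumes the v2 API wraps the titles in response -> records -> newspaper;
# check the raw JSON if the structure differs.
api_key = 'YOUR API KEY'
response = requests.get(
    'https://api.trove.nla.gov.au/v2/newspaper/titles',
    params={'encoding': 'json', 'key': api_key}
)
for title in response.json()['response']['records']['newspaper']:
    print(title['id'], title['title'])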

Harvest summary

Import what we need

In [ ]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import re
from pathlib import Path
from tqdm.auto import tqdm
import time
import pandas as pd
from IPython.display import display, FileLink

s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
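
Requests made through this session will automatically retry (up to five times, with increasing delays) if Trove returns a 502, 503, or 504 error, so a transient server problem won't derail a long harvest.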

Set some options

Modify the values below as required. You'll need to insert your own Trove API key between the quotes.

In [ ]:
API_KEY = 'YOUR API KEY'

# The id of the newspaper you want to harvest
TITLE_ID = '112' # 112 is the AWW

# Range of years to harvest (the END_YEAR itself is not included)
START_YEAR = 1933
END_YEAR = 1983

# A prefix to use in file names; if None, the TITLE_ID will be used
PREFIX = 'aww'

Define some functions

In [ ]:
TITLE_URL = f'https://api.trove.nla.gov.au/v2/newspaper/title/{TITLE_ID}'

def get_current_year(years, year):
    '''
    Get data for the current year from the list of years returned by the API.
    '''
    for year_data in years:
        if year_data['date'] == str(year):
            return year_data

def get_issues():
    '''
    Get all the issue details by looping through the range of years.
    Returns a list of issues.
    '''
    params = {
        'encoding': 'json',
        'include': 'years',
        'key': API_KEY
    }
    issues = []
    for year in tqdm(range(START_YEAR, END_YEAR), desc='Issues'):
        # Setting 'range' tells the API to give us a list of issue dates & urls within that date range
        date_range = f'{year}0101-{year}1231'
        params['range'] = date_range
        # Get the data
        response = s.get(TITLE_URL, params=params)
        data = response.json()
        # Extract the details for the current year
        year_data = get_current_year(data['newspaper']['year'], year)
        # Save issue details (guard against years with no issues)
        if year_data:
            for issue in year_data.get('issue', []):
                issues.append(issue)
        time.sleep(0.2)
    return issues

def get_file_prefix():
    '''
    Set the prefix to be used in filenames and data directory.
    Defaults to title id if prefix is not set
    '''
    if PREFIX:
        file_prefix = PREFIX
    else:
        file_prefix = TITLE_ID
    return file_prefix

def create_output_dir(file_prefix):
    '''
    Create output directory.
    '''
    dir_path = Path('data', file_prefix)
    dir_path.mkdir(parents=True, exist_ok=True)
    return dir_path

def download_page(page_id, size, file_path):
    '''
    Download page image using the supplied id.
    Size range is 1 to 7 (7 being the highest res)
    '''
    # Format the page url using the page id
    page_url = f'http://trove.nla.gov.au/ndp/imageservice/nla.news-page{page_id}/level{size}'
    # Download the image
    response = s.get(page_url)
    file_path.write_bytes(response.content)
    time.sleep(0.5)

def harvest_covers(size=5):
    '''
    Get a list of issues of the title.
    Loop through the issues downloading each front page/cover.
    Return issue metadata.
    '''
    # Get a list of issues
    issues = get_issues()
    # Loop through the issues
    for issue in tqdm(issues, desc='Pages'):
        # Request the issue url
        response = s.get(issue['url'])
        # The issue url will be redirected to a page url
        # Extract the page id from the page url
        page_id = re.search(r'(\d+)$', response.url).group(1)
        # Save page id to metadata
        issue['page_id'] = page_id
        # Set up dirs and files
        file_prefix = get_file_prefix()
        dir_path = create_output_dir(file_prefix)
        file_path = Path(dir_path, f'{file_prefix}-{issue["date"].replace("-", "")}-page{page_id}.jpg')
        # If the image hasn't already been downloaded, then download it!
        if not file_path.exists():
            download_page(page_id, size, file_path)
        # Save the image name to the metadata
        issue['image_name'] = file_path.name
        time.sleep(0.2)
    return issues
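
If you want to see what the issue metadata looks like before committing to a full harvest, the optional cell below requests a single year's worth of issues and displays the first couple of records. It's just a sketch that repeats the request get_issues() makes for each year; each issue should include an id, a date, and a url.

In [ ]:
# Optional check -- request one year of issues and peek at the first records.
params = {
    'encoding': 'json',
    'include': 'years',
    'range': f'{START_YEAR}0101-{START_YEAR}1231',
    'key': API_KEY
}
response = s.get(TITLE_URL, params=params)
data = response.json()
year_data = get_current_year(data['newspaper']['year'], START_YEAR)
year_data['issue'][:2] if year_data else 'No issues found for this year'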

Run the harvest!

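By default the covers are downloaded at zoom level 5; pass a different size to harvest_covers() (from 1 up to 7, the highest resolution) if you want smaller or larger images.
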
In [ ]:
issues = harvest_covers()
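
Once the harvest has saved some images you can display one inline as a quick check. This step is optional; it just grabs the first .jpg in the output directory.

In [ ]:
from IPython.display import Image

# Display the first downloaded cover as a quick sanity check
covers = sorted(Path('data', get_file_prefix()).glob('*.jpg'))
Image(filename=str(covers[0]), width=300)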

Save the metadata

In [ ]:
df = pd.DataFrame(issues)
df.rename(columns={'id': 'issue_id'}, inplace=True)
df.head()
In [ ]:
file_prefix = get_file_prefix()
df.to_csv(f'data/{file_prefix}-issues.csv', index=False)
display(FileLink(f'data/{file_prefix}-issues.csv'))
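
If you're running this notebook in a cloud service like Binder, you might also want to bundle all the downloaded images into a single zip file so they're easier to download. Here's a minimal sketch using Python's standard shutil module.

In [ ]:
import shutil

# Zip up the directory of downloaded covers so it can be downloaded in one go
zip_path = shutil.make_archive(f'data/{file_prefix}-covers', 'zip', Path('data', file_prefix))
display(FileLink(zip_path))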

Created by Tim Sherratt for the GLAM Workbench.