Get full page screenshots from archived web pages

View in GitHub · View in GLAM Workbench

In [ ]:
# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.
In [ ]:
%%capture
from selenium import webdriver
import requests
import selenium
from PIL import Image
import PIL
import io
import base64
import time
import re
import math
import arrow
from slugify import slugify
from webdriverdownloader import GeckoDriverDownloader
from pathlib import Path
from IPython.display import display, HTML
from urllib.parse import urlparse
import ipywidgets as widgets

# Download and install a pinned geckodriver release (v0.26.0) so Selenium can drive Firefox.
# Element [1] of the returned value is kept and later passed to webdriver.Firefox as
# 'executable_path' -- presumably the path to the installed driver; verify against the
# webdriverdownloader documentation.
gdd = GeckoDriverDownloader()
geckodriver = gdd.download_and_install("v0.26.0")[1]
In [ ]:
# Base urls of the Memento Timegates, keyed by the short repository codes
# that the 'Archive' dropdown supplies as values.
TIMEGATES = {
    'nla': 'https://web.archive.org.au/awa/',
    'nlnz': 'https://ndhadeliver.natlib.govt.nz/webarchive/wayback/',
    'bl': 'https://www.webarchive.org.uk/wayback/archive/',
    'ia': 'https://web.archive.org/web/',
}

# Domains whose capture urls get the 'if_' modifier added before screenshotting.
wayback = [
    'ndhadeliver.natlib.govt.nz',
    'web.archive.org',
]

# Domains that replay captures inside a frame, mapped to the name of the frame
# the webdriver has to switch to before taking the screenshot.
pywb = {
    'web.archive.org.au': 'replayFrame',
    'webarchive.nla.gov.au': 'replayFrame',
    'webarchive.org.uk': 'replay_iframe',
}

# HTML snippets for the screenshots currently on display (one entry per screenshot).
html_output = []

def format_date_for_headers(iso_date, tz):
    '''
    Build the value for an Accept-Datetime header from a date and a timezone.

    The ISO date (YYYY-MM-DD) is read as noon in the timezone named by 'tz',
    converted to UTC, and formatted like this:
    eg Fri, 23 Mar 2007 01:00:00 GMT
    '''
    noon_local = arrow.get(f'{iso_date} 12:00:00 {tz}', 'YYYY-MM-DD HH:mm:ss ZZZ')
    stamp = noon_local.to('utc').format("ddd, DD MMM YYYY HH:mm:ss")
    return f'{stamp} GMT'

def format_date_from_timestamp(url):
    '''
    Pull the 14 digit capture timestamp out of a Memento url and return it
    as a readable date in 'D MMMM YYYY' form, eg 23 March 2007.

    Raises an AttributeError if the url doesn't contain a timestamp.
    '''
    match = re.search(r'/(\d{14})(?:if_|mp_)*/', url)
    captured = arrow.get(match.group(1), 'YYYYMMDDHHmmss')
    return captured.format('D MMMM YYYY')

def parse_links_from_headers(response):
    '''
    Reduce the parsed 'Link' header of a response to a plain dictionary
    that maps each link relation (original, timegate, timemap, memento etc)
    to its url.
    '''
    urls_by_rel = {}
    for rel, link in response.links.items():
        urls_by_rel[rel] = link['url']
    return urls_by_rel

def query_timegate(timegate, url, date=None, tz='Australia/Canberra'):
    '''
    Ask a repository's Timegate about a url and return the links from its response.

    Parameters:
        timegate: one of the keys in TIMEGATES ('nla', 'nlnz', 'bl', 'ia')
        url: the url of the page you're looking for
        date: optional target date, sent as an Accept-Datetime header
        tz: name of the timezone used to interpret the date

    Returns a dictionary of link relations and urls taken from the 'Link' header.
    '''
    if date:
        accept_datetime = format_date_for_headers(date, tz)
    elif timegate in ['bl', 'nlnz']:
        # BL & NLNZ don't seem to default to latest date if no date supplied, so ask for today
        accept_datetime = format_date_for_headers(arrow.utcnow().format('YYYY-MM-DD'), tz)
    else:
        accept_datetime = None
    headers = {} if accept_datetime is None else {'Accept-Datetime': accept_datetime}
    # Note that you don't get a timegate response if you leave off the trailing slash
    target = url if url.endswith('/') else f'{url}/'
    tg_url = f'{TIMEGATES[timegate]}{target}'
    # IA doesn't work with head, others don't work with get...
    fetch = requests.get if timegate == 'ia' else requests.head
    response = fetch(tg_url, headers=headers)
    return parse_links_from_headers(response)

def get_memento(timegate, url, date):
    '''
    Find the url of an archived capture (a Memento) of a page.

    Parameters:
        timegate: one of the keys in TIMEGATES ('nla', 'nlnz', 'bl', 'ia')
        url: the url of the page you're looking for
        date: target date for the capture (can be None)

    Returns the Memento url, or None if the Timegate response doesn't include
    any of the Memento links we know how to use.
    '''
    links = query_timegate(timegate, url, date)
    if not links:
        return None
    # NLNZ doesn't always seem to return a Memento, so we'll build in some fuzziness --
    # try the exact match first, then fall back to the neighbouring captures.
    for rel in ('memento', 'prev memento', 'next memento', 'last memento'):
        if rel in links:
            return links[rel]
    # Links were returned, but none of them point to a Memento.
    # (This case used to raise an UnboundLocalError.)
    return None
    
def get_full_page_screenshot(url, save_width=200):
    '''
    Gets a full page screenshot of the supplied url.
    By default resizes the screenshot to a maximum width of 200px.
    Provide a 'save_width' value to change this.

    Returns the path of the saved PNG file (in the 'screenshots' directory),
    or None if the url isn't a Memento or no screenshot could be captured --
    in both of those cases an error message is displayed using show_error().

    WebDriver exceptions raised while loading the page are passed on to the caller,
    but the browser is always shut down first.

    NOTE the webdriver sometimes fails for unknown reasons. Just try again.
    '''
    domain = urlparse(url)[1].replace('www.', '')
    # NZ and IA inject content into the page, so we use if_ to get the original page (with rewritten urls)
    if domain in wayback and 'if_' not in url:
        url = re.sub(r'/(\d{14})/http', r'/\1if_/http', url)
    match = re.search(r'/(\d{14})(?:if_|mp_)*/https*://(.+/)', url)
    if match is None:
        # There's something wrong with the link...
        show_error(f'{url} isn\'t a Memento – did you forget to select an archive?')
        return None
    date_str, site = match.groups()
    output_dir = Path('screenshots')
    output_dir.mkdir(parents=True, exist_ok=True)
    ss_file = Path(output_dir, f'{slugify(site)}-{date_str}-{save_width}.png')
    options = webdriver.FirefoxOptions()
    options.headless = True
    driver = webdriver.Firefox(executable_path=geckodriver, options=options)
    ss = None
    # Make sure the browser gets shut down even if loading the page fails,
    # otherwise every failed attempt leaves a Firefox process running.
    try:
        driver.implicitly_wait(15)
        driver.get(url)
        # Give some time for everything to load
        time.sleep(30)
        driver.maximize_window()
        # UK and AU use pywb in framed replay mode, so we need to switch to the framed content
        if domain in pywb:
            try:
                driver.switch_to.frame(pywb[domain])
            except selenium.common.exceptions.NoSuchFrameException:
                # If we pass here we'll probably still get a ss, just not full page -- better than failing?
                pass
        # Try each of the possible top-level elements until one gives us a screenshot
        for tag in ['body', 'html', 'frameset']:
            try:
                elem = driver.find_element_by_tag_name(tag)
                ss = elem.screenshot_as_base64
                break
            except (selenium.common.exceptions.NoSuchElementException, selenium.common.exceptions.WebDriverException):
                # This element is missing or can't be captured, move on to the next one
                continue
    finally:
        driver.quit()
    if not ss:
        show_error(f'Couldn\'t get a screenshot of {url} – sorry...')
        return None
    img = Image.open(io.BytesIO(base64.b64decode(ss)))
    # Scale the height by the same ratio as the width to keep the page's proportions
    ratio = save_width / img.width
    (width, height) = (save_width, math.ceil(img.height * ratio))
    resized_img = img.resize((width, height), PIL.Image.LANCZOS)
    resized_img.save(ss_file)
    return ss_file
        
def display_screenshot(ss_file, url):
    '''
    Add a screenshot to the collection on display, then re-render the collection.

    Each screenshot is shown with its capture date, a link to the capture
    (with any 'if_' modifier removed), and a download link for the image file.
    '''
    date = format_date_from_timestamp(url)
    # Show the original url (everything after the timestamp) if we can find it
    match = re.search(r'/\d{14}(?:mp_|if_|id_)*/(.*)$', url)
    display_url = match.group(1) if match else url
    status.clear_output()
    capture_link = url.replace("if_/", "/")
    snippet = (
        f'<div style="float:left; margin-left: 20px;">'
        f'<p><b>{date}</b><br><a href="{capture_link}">{display_url}</a></p>'
        f'<p><a href="{ss_file}"><img src="{ss_file}"></a><br><a href="{ss_file}">[Download]</a></p>'
        f'</div>'
    )
    html_output.append(snippet)
    with out:
        display((HTML(''.join(html_output))))

def show_error(message=None):
    '''
    Replace whatever is in the status area with an error message.
    '''
    text = f'Something went wrong – {message}'
    status.clear_output()
    with status:
        print(text)

def start(e):
    '''
    Click handler for the 'Get screenshot' button ('e' is the value passed by the button, it's not used).

    If an archive has been selected, the target url is looked up using the archive's Timegate,
    otherwise the target url is treated as the url of a capture.
    The screenshot is then generated and displayed. Any problems are reported in the status area.
    '''
    status.clear_output()
    out.clear_output(wait=True)
    with status:
        print('Generating screenshot...')
    # Pasted urls often come with stray whitespace
    url = target_url.value.strip()
    if repository.value:
        try:
            memento = get_memento(repository.value, url, target_date.value)
        except requests.exceptions.RequestException:
            # Without this a network problem would leave 'Generating screenshot...' on screen forever
            show_error('couldn\'t get a response from the archive – sorry...')
            return
    else:
        memento = url
    if not memento:
        show_error('couldn\'t find a Memento – sorry...')
        return
    try:
        ss_file = get_full_page_screenshot(memento, save_width=width.value)
        if ss_file:
            display_screenshot(ss_file, memento)
    except selenium.common.exceptions.WebDriverException:
        show_error(f'couldn\'t get a screenshot of {memento} – sorry...')
    
def clear(e):
    '''
    Click handler for the 'Clear all' button -- empties the status and screenshot areas
    and forgets all the screenshots collected so far.
    '''
    global html_output
    status.clear_output()
    out.clear_output()
    html_output = []
    
def clear_last(e):
    '''
    Click handler for the 'Clear last' button -- removes the most recently added screenshot
    and re-renders the ones that are left.
    '''
    # Nothing to remove if no screenshots have been added yet (pop() on an empty list raises an IndexError)
    if html_output:
        html_output.pop()
    out.clear_output(wait=True)
    with out:
        display((HTML(''.join(html_output))))

# Archive to search -- the values are keys in TIMEGATES, an empty value means
# that the target url is already the url of a capture.
repository = widgets.Dropdown(
    options=[('---', ''), ('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],
    description='Archive:',
    disabled=False,
)

# Url of the page (or of a specific capture if no archive is selected)
target_url = widgets.Text(description='Target URL:')

# Optional date used to find the closest capture
target_date = widgets.DatePicker(
    description='Target date: ',
    disabled=False
)

# Width of the saved screenshot in pixels.
# The starting value has to fall between min and max, so start at the minimum
# (which is also the default 'save_width' of get_full_page_screenshot).
width = widgets.IntSlider(
    value=200,
    min=200,
    max=1000,
    step=100,
    description='Width:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

# Output areas: 'out' holds the screenshots, 'status' holds progress and error messages
out = widgets.Output()
status = widgets.Output()

# Buttons
ss_button = widgets.Button(description='Get screenshot', button_style='primary')
clear_button = widgets.Button(description='Clear all')
clear_last_button = widgets.Button(description='Clear last')

# Connect each button to its click handler
ss_button.on_click(start)
clear_button.on_click(clear)
clear_last_button.on_click(clear_last)
# Instructions displayed above the form (every list item needs its closing tag)
note = '''
    <ul class="browser-default">
    <li>Select a repository, and insert a url to generate a screenshot from the archive.</li>
    <li>If you include a date, it\'ll attempt to find the closest capture using Memento Timegates.</li>
    <li>If you don't include a date, it'll give you the most recent capture.</li>
    <li>If you already have the url of the exact capture you want, just put it in the 'Target url' box and leave 'Archive' and 'Target date' blank.</li>
    <li>You can add multiple screenshots to compare changes.</li>
    </ul>
    '''
# Lay out the app: instructions, the form (two columns), the buttons, then the status and screenshot areas
display(
    HTML(note),
    widgets.HBox(
        [widgets.VBox([repository, target_date]), widgets.VBox([target_url, width])],
        layout=widgets.Layout(padding='20px')
    ),
    widgets.HBox([ss_button, clear_button, clear_last_button]),
    status,
    out
)

Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!

Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020