Display changes in the text of an archived web page over time

This notebook displays changes in the text content of a web page over time. It retrieves a list of available captures from a Memento Timemap, then compares each capture with its predecessor, displaying changes side-by-side.

By default, the notebook only displays lines that have changed. If you want to see more context, you can adjust the parameters in the show_all_differences() function to show lines around each change, or the complete text content.

In [ ]:
# Parameters:

# The url of the web page to examine (can be pre-set via the notebook's querystring)
url = ""
# Short code of the web archive to query -- see TIMEGATES below for valid values
archive = ""
In [ ]:
from difflib import HtmlDiff
import requests
from IPython.display import display, HTML
import re
import arrow
from bs4 import BeautifulSoup, Tag
import ipywidgets as widgets
import json
from collections import deque
from urllib.parse import quote
import time

# This is to restyle the standard html table output from difflib
# (green background = added, yellow + underline = changed, red + strikethrough = deleted)
HTML('<style>.diff_add {background-color: #ccffcc;} .diff_chg {background-color: #ffffcc; text-decoration: underline;} .diff_sub {background-color: #ffcccc; text-decoration: line-through;} table.diff, table.diff thead {border: 1px solid black;} table.diff {table-layout: fixed; width: 100%; margin-bottom: 20px;} th.diff_next, td.diff_next {width: 4%;} table.diff th.diff_header {text-align: left;} table.diff tbody {border: none;} table.diff td {word-break: break-word; text-align: left;}</style>')
In [ ]:
%%javascript
// This is necessary in Jupyter notebook to stop the output area folding up
// Will give an error in Jupyter Lab
// (overrides the output auto-scroll check so long diff tables stay fully expanded)
IPython.OutputArea.prototype._should_scroll = function(lines) {return false}
In [ ]:
# Default list of repositories -- you could add to this
# Keys are the short codes used by the 'archive' parameter and the dropdown below;
# values are the base urls used to build Timemap and capture requests.
TIMEGATES = {
    'nla': 'https://web.archive.org.au/awa/',
    'nlnz': 'https://ndhadeliver.natlib.govt.nz/webarchive/wayback/',
    'bl': 'https://www.webarchive.org.uk/wayback/archive/',
    'ia': 'https://web.archive.org/web/'
}
In [ ]:
# This deque will only hold a maximum of two values
# So we can just push new pages into it, and it will shove the old ones out the back.
# When full, html_data[0] is the older capture and html_data[1] the newer one.
html_data = deque('', 2)

def get_html(url):
    '''
    Retrieve an archived capture.

    Returns a dict with the capture's resolved 'url' and raw 'html' content,
    or None if the response url doesn't look like an actual capture.
    '''
    response = requests.get(url)
    # Sometimes the Mementos don't go to captures?!
    # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/
    # A real capture url contains a 14-digit timestamp -- if it's missing, bail out.
    if not re.search(r'/(\d{14})id_/', response.url):
        return None
    return {'url': response.url, 'html': response.content}

def format_date(url):
    '''Pull the 14-digit timestamp out of a capture url and render it as a readable date.'''
    capture_time = re.search(r'/(\d{14})id_/', url).group(1)
    parsed = arrow.get(capture_time, 'YYYYMMDDHHmmss')
    return parsed.format('D MMMM YYYY')

def convert_lists_to_dicts(results):
    '''
    Converts IA style timemap (a JSON array of arrays) to a list of dictionaries.
    Renames keys to standardise IA with other Timemaps.
    '''
    if results:
        # First row holds the field names; remaining rows are the captures
        header, *rows = results
        converted = [dict(zip(header, row)) for row in rows]
    else:
        converted = results
    # Standardise IA's field names with those used by other repositories
    for record in converted:
        record['status'] = record.pop('statuscode')
        record['mime'] = record.pop('mimetype')
        record['url'] = record.pop('original')
    return converted

def get_capture_data_from_memento(url, request_type='head'):
    '''
    For OpenWayback systems this can get some extra capture info to insert in Timemaps.
    Reads the x-archive-orig-* response headers and returns a dict with
    'length', 'status' and 'mime' keys (None when a header is missing).
    '''
    # A HEAD request is usually enough; fall back to GET on request
    fetcher = requests.head if request_type == 'head' else requests.get
    headers = fetcher(url).headers
    raw_status = headers.get('x-archive-orig-status')
    raw_mime = headers.get('x-archive-orig-content-type')
    return {
        'length': headers.get('x-archive-orig-content-length'),
        'status': raw_status.split(' ')[0] if raw_status else None,
        'mime': raw_mime.split(';')[0] if raw_mime else None,
    }

def convert_link_to_json(results, enrich_data=False):
    '''
    Converts link formatted Timemap to JSON.

    Parameters:
        results -- a link-format (application/link-format) Timemap as a string
        enrich_data -- if True, make an extra request per memento to add
                       status/mime/length details from the capture's headers

    Returns a list of dicts with 'timestamp' and 'url' keys.
    '''
    data = []
    for line in results.splitlines():
        parts = line.split('; ')
        if len(parts) > 1:
            match = re.search(r'rel="(original|self|timegate|first memento|last memento|memento)"', parts[1])
            # Guard against rel values we don't recognise (eg rel="timemap"),
            # which would otherwise crash with an AttributeError on .group()
            if match and match.group(1) == 'memento':
                link = parts[0].strip('<>')
                timestamp, original = re.search(r'/(\d{14})/(.*)$', link).groups()
                capture = {'timestamp': timestamp, 'url': original}
                if enrich_data:
                    capture.update(get_capture_data_from_memento(link))
                data.append(capture)
    return data
                
def get_timemap_as_json(timegate, url):
    '''
    Get a Timemap then normalise results (if necessary) to return a list of dicts.

    Returns an empty list if the response content-type isn't recognised
    (previously this raised an UnboundLocalError).
    '''
    tg_url = f'{TIMEGATES[timegate]}timemap/json/{url}/'
    response = requests.get(tg_url)
    response_type = response.headers['content-type']
    # Default so we always return something sensible
    data = []
    # pywb style Timemap
    if response_type == 'text/x-ndjson':
        data = [json.loads(line) for line in response.text.splitlines()]
    # IA Wayback style Timemap
    elif response_type == 'application/json':
        data = convert_lists_to_dicts(response.json())
    # Link style Timemap (OpenWayback)
    elif response_type in ['application/link-format', 'text/html;charset=utf-8']:
        data = convert_link_to_json(response.text)
    return data

def process_text(html):
    '''
    Extract text from an HTML page and return it as a list of lines.
    Removes blank (or whitespace-only) lines.
    '''
    # Name the parser explicitly so the result doesn't depend on which
    # parsers happen to be installed (and to avoid bs4's GuessedAtParserWarning)
    text = BeautifulSoup(html, 'html.parser').get_text()
    return [line for line in text.splitlines() if line.strip()]

def format_date_link(url):
    '''Return the capture's formatted date wrapped in a link back to the capture.'''
    return f'<a href="{url}">{format_date(url)}</a>'

def show_line_differences(context=True, numlines=0):
    '''
    Use difflib to show a side-by-side comparison of the text in two web pages.

    Compares the two captures currently held in the html_data deque and
    writes an HTML diff table to the notebook's output widget.
    '''
    older, newer = html_data
    table = HtmlDiff().make_table(
        process_text(older['html']),
        process_text(newer['html']),
        context=context,
        numlines=numlines,
        fromdesc=format_date_link(older['url']),
        todesc=format_date_link(newer['url']),
    )
    # Rewrite the table html to make the column widths work better
    table = table.replace(r'<th colspan="2" class="diff_header"', '<th class="diff_next"></th><th class="diff_header"')
    # Cleaning up the table output
    for target, replacement in (('nowrap="nowrap"', ''), ('<tbody>', ''), ('</tbody>', '')):
        table = table.replace(target, replacement)
    with out:
        display(HTML(table))

def show_all_differences(timegate, url):
    '''
    Get all captures for a page from a Timemap, then compare each page with its predecessor,
    display changes side-by-side.
    '''
    global html_data
    timemap = get_timemap_as_json(timegate, url)
    with out:
        key = '<b>Key</b><ul><li><span class="diff_sub">deleted text</span></li><li><span class="diff_chg">changed text</span></li><li><span class="diff_add">added text</li></ul>'
        display(HTML(key))
    for i, capture in enumerate(timemap):
        capture_url = f'{TIMEGATES[timegate]}{capture["timestamp"]}id_/{capture["url"]}'
        # NLNZ Timemaps don't include digests, so we can't skip unchanged captures there.
        # Link-format Timemaps lack 'digest'/'status' keys entirely, so use .get()
        # and treat a missing digest as 'changed' and a missing status as ok.
        # The first capture has no predecessor (the old code compared it with
        # timemap[-1], i.e. the LAST capture), so always include it.
        digest = capture.get('digest')
        previous_digest = timemap[i - 1].get('digest') if i > 0 else None
        changed = i == 0 or digest is None or digest != previous_digest
        if timegate == 'nlnz' or (changed and capture.get('status', '200') == '200'):
            capture_data = get_html(capture_url)
            if capture_data:
                html_data.append(capture_data)
                if len(html_data) == 2:
                    # You could change the params below to show context around changes
                    # context=False -- shows the whole document
                    # numlines -- when context=True, the number of lines to show around the diff 
                    # numline=0 -- just the diffs
                    show_line_differences(context=True, numlines=0)

def share_this():
    '''Display a Binder link that re-opens this notebook with the current url/archive pre-set.'''
    binder_url = 'https://mybinder.org/v2/gh/GLAM-Workbench/web-archives/master?urlpath=/apps/display-text-changes-from-timemap.ipynb'
    # Percent-encode the current widget values as querystring parameters
    params = quote(f'?url="{target_url.value}"&archive="{repository.value}"')
    link = f'{binder_url}{params}'
    with out:
        display(HTML(f'<p>Share this: <a href="{link}">{link}</a></p>'))

def start(e):
    '''
    Button callback: clear any previous output, then display all text
    changes for the current url/archive and a shareable link.
    '''
    # Forward the triggering event rather than a placeholder string
    clear(e)
    show_all_differences(repository.value, target_url.value)
    share_this()
    

def clear(e):
    '''Button callback: drop any stored captures and wipe the output area.'''
    global html_data
    html_data.clear()
    out.clear_output()
    
# Output area that all diff tables and messages are written into
out = widgets.Output()

# Archive picker -- pre-selected from the 'archive' parameter if supplied
repository = widgets.Dropdown(
    options=[('---', ''), ('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],
    description='Archive:',
    disabled=False,
    value=archive
)

# Url of the page to examine -- pre-filled from the 'url' parameter if supplied
target_url = widgets.Text(description='URL:', value=url)

tc_button = widgets.Button(description='Show text changes', button_style='primary')
tc_button.on_click(start)
clear_button = widgets.Button(description='Clear all')
clear_button.on_click(clear)

display(widgets.HBox([repository, target_url], layout=widgets.Layout(padding='10px')))
display(widgets.HBox([tc_button, clear_button], layout=widgets.Layout(padding='10px')))
display(out)

# If both parameters were supplied (eg via a share link) run immediately
if archive and url:
    start('e')

Created by Tim Sherratt for the GLAM Workbench.

Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020.