Display changes in the text of an archived web page over time

This notebook displays changes in the text content of a web page over time. It retrieves a list of available captures from a Memento Timemap, then compares each capture with its predecessor, displaying changes side-by-side.

By default, the notebook only displays lines that have changed. If you want to see more context, you can adjust the parameters in the show_all_differences() function to show lines around each change, or the complete text content.

In [ ]:
# Parameters:

# The url of the web page to examine (can be pre-set via the notebook's querystring)
url = ""
# Short code of the web archive to query -- see TIMEGATES below for valid values
archive = ""
In [ ]:
from difflib import HtmlDiff
import requests
from IPython.display import display, HTML
import re
import arrow
from bs4 import BeautifulSoup, Tag
import ipywidgets as widgets
import json
from collections import deque
from urllib.parse import quote
import time

# This is to restyle the standard html table output from difflib
# (green background = added, yellow + underline = changed, red + strikethrough = deleted)
HTML('<style>.diff_add {background-color: #ccffcc;} .diff_chg {background-color: #ffffcc; text-decoration: underline;} .diff_sub {background-color: #ffcccc; text-decoration: line-through;} table.diff, table.diff thead {border: 1px solid black;} table.diff {table-layout: fixed; width: 100%; margin-bottom: 20px;} th.diff_next, td.diff_next {width: 4%;} table.diff th.diff_header {text-align: left;} table.diff tbody {border: none;} table.diff td {word-break: break-word; text-align: left;}</style>')
In [ ]:
%%javascript
// This is necessary in Jupyter notebook to stop the output area folding up
// Will give an error in Jupyter Lab
// (overrides the output auto-scroll check so long diff tables stay fully expanded)
IPython.OutputArea.prototype._should_scroll = function(lines) {return false}
In [ ]:
# Default list of repositories -- you could add to this
# Keys are the short codes used by the 'archive' parameter and the dropdown below;
# values are the base urls used to build Timemap and capture requests.
TIMEGATES = {
    'nla': 'https://web.archive.org.au/awa/',
    'nlnz': 'https://ndhadeliver.natlib.govt.nz/webarchive/wayback/',
    'bl': 'https://www.webarchive.org.uk/wayback/archive/',
    'ia': 'https://web.archive.org/web/'
}
In [ ]:
# This deque will only hold a maximum of two values
# So we can just push new pages into it, and it will shove the old ones out the back.
# When full, html_data[0] is the older capture and html_data[1] the newer one.
html_data = deque('', 2)

def get_html(url):
    '''
    Retrieve an archived capture.

    Returns a dict with the capture's resolved 'url' and raw 'html' content,
    or None if the response url doesn't look like an actual capture.
    '''
    response = requests.get(url)
    # Sometimes the Mementos don't go to captures?!
    # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/
    # A real capture url contains a 14-digit timestamp -- if it's missing, bail out.
    if not re.search(r'/(\d{14})id_/', response.url):
        return None
    return {'url': response.url, 'html': response.content}

def format_date(url):
    '''Pull the 14-digit timestamp out of a capture url and render it as a readable date.'''
    capture_time = re.search(r'/(\d{14})id_/', url).group(1)
    parsed = arrow.get(capture_time, 'YYYYMMDDHHmmss')
    return parsed.format('D MMMM YYYY')

def convert_lists_to_dicts(results):
    '''
    Converts IA style timemap (a JSON array of arrays) to a list of dictionaries.
    Renames keys to standardise IA with other Timemaps.
    '''
    if results:
        # First row holds the field names; remaining rows are the captures
        header, *rows = results
        converted = [dict(zip(header, row)) for row in rows]
    else:
        converted = results
    # Standardise IA's field names with those used by other repositories
    for record in converted:
        record['status'] = record.pop('statuscode')
        record['mime'] = record.pop('mimetype')
        record['url'] = record.pop('original')
    return converted

def get_capture_data_from_memento(url, request_type='head'):
    '''
    For OpenWayback systems this can get some extra capture info to insert in Timemaps.
    Reads the x-archive-orig-* response headers and returns a dict with
    'length', 'status' and 'mime' keys (None when a header is missing).
    '''
    # A HEAD request is usually enough; fall back to GET on request
    fetcher = requests.head if request_type == 'head' else requests.get
    headers = fetcher(url).headers
    raw_status = headers.get('x-archive-orig-status')
    raw_mime = headers.get('x-archive-orig-content-type')
    return {
        'length': headers.get('x-archive-orig-content-length'),
        'status': raw_status.split(' ')[0] if raw_status else None,
        'mime': raw_mime.split(';')[0] if raw_mime else None,
    }

def convert_link_to_json(results, enrich_data=False):
    '''
    Converts link formatted Timemap to JSON.

    Parameters:
        results -- a link-format (application/link-format) Timemap as a string
        enrich_data -- if True, make an extra request per memento to add
                       status/mime/length details from the capture's headers

    Returns a list of dicts with 'timestamp' and 'url' keys.
    '''
    data = []
    for line in results.splitlines():
        parts = line.split('; ')
        if len(parts) > 1:
            match = re.search(r'rel="(original|self|timegate|first memento|last memento|memento)"', parts[1])
            # Guard against rel values we don't recognise (eg rel="timemap"),
            # which would otherwise crash with an AttributeError on .group()
            if match and match.group(1) == 'memento':
                link = parts[0].strip('<>')
                timestamp, original = re.search(r'/(\d{14})/(.*)$', link).groups()
                capture = {'timestamp': timestamp, 'url': original}
                if enrich_data:
                    capture.update(get_capture_data_from_memento(link))
                data.append(capture)
    return data
                
def get_timemap_as_json(timegate, url):
    '''
    Get a Timemap then normalise results (if necessary) to return a list of dicts.

    Returns an empty list if the response content-type isn't recognised
    (previously this raised an UnboundLocalError).
    '''
    tg_url = f'{TIMEGATES[timegate]}timemap/json/{url}/'
    response = requests.get(tg_url)
    response_type = response.headers['content-type']
    # Default so we always return something sensible
    data = []
    # pywb style Timemap
    if response_type == 'text/x-ndjson':
        data = [json.loads(line) for line in response.text.splitlines()]
    # IA Wayback style Timemap
    elif response_type == 'application/json':
        data = convert_lists_to_dicts(response.json())
    # Link style Timemap (OpenWayback)
    elif response_type in ['application/link-format', 'text/html;charset=utf-8']:
        data = convert_link_to_json(response.text)
    return data

def process_text(html):
    '''
    Extract text from an HTML page and return it as a list of lines.
    Removes blank (or whitespace-only) lines.
    '''
    # Name the parser explicitly so the result doesn't depend on which
    # parsers happen to be installed (and to avoid bs4's GuessedAtParserWarning)
    text = BeautifulSoup(html, 'html.parser').get_text()
    return [line for line in text.splitlines() if line.strip()]

def format_date_link(url):
    '''Return the capture's formatted date wrapped in a link back to the capture.'''
    return f'<a href="{url}">{format_date(url)}</a>'

def show_line_differences(context=True, numlines=0):
    '''
    Use difflib to show a side-by-side comparison of the text in two web pages.

    Compares the two captures currently held in the html_data deque and
    writes an HTML diff table to the notebook's output widget.
    '''
    older, newer = html_data
    table = HtmlDiff().make_table(
        process_text(older['html']),
        process_text(newer['html']),
        context=context,
        numlines=numlines,
        fromdesc=format_date_link(older['url']),
        todesc=format_date_link(newer['url']),
    )
    # Rewrite the table html to make the column widths work better
    table = table.replace(r'<th colspan="2" class="diff_header"', '<th class="diff_next"></th><th class="diff_header"')
    # Cleaning up the table output
    for target, replacement in (('nowrap="nowrap"', ''), ('<tbody>', ''), ('</tbody>', '')):
        table = table.replace(target, replacement)
    with out:
        display(HTML(table))

def show_all_differences(timegate, url):
    '''
    Get all captures for a page from a Timemap, then compare each page with its predecessor,
    display changes side-by-side.
    '''
    global html_data
    timemap = get_timemap_as_json(timegate, url)
    with out:
        key = '<b>Key</b><ul><li><span class="diff_sub">deleted text</span></li><li><span class="diff_chg">changed text</span></li><li><span class="diff_add">added text</li></ul>'
        display(HTML(key))
    for i, capture in enumerate(timemap):
        capture_url = f'{TIMEGATES[timegate]}{capture["timestamp"]}id_/{capture["url"]}'
        # NLNZ Timemaps don't include digests, so we can't skip unchanged captures there.
        # Link-format Timemaps lack 'digest'/'status' keys entirely, so use .get()
        # and treat a missing digest as 'changed' and a missing status as ok.
        # The first capture has no predecessor (the old code compared it with
        # timemap[-1], i.e. the LAST capture), so always include it.
        digest = capture.get('digest')
        previous_digest = timemap[i - 1].get('digest') if i > 0 else None
        changed = i == 0 or digest is None or digest != previous_digest
        if timegate == 'nlnz' or (changed and capture.get('status', '200') == '200'):
            capture_data = get_html(capture_url)
            if capture_data:
                html_data.append(capture_data)
                if len(html_data) == 2:
                    # You could change the params below to show context around changes
                    # context=False -- shows the whole document
                    # numlines -- when context=True, the number of lines to show around the diff 
                    # numline=0 -- just the diffs
                    show_line_differences(context=True, numlines=0)

def share_this():
    '''Display a Binder link that re-opens this notebook with the current url/archive pre-set.'''
    binder_url = 'https://mybinder.org/v2/gh/GLAM-Workbench/web-archives/master?urlpath=/apps/display-text-changes-from-timemap.ipynb'
    # Percent-encode the current widget values as querystring parameters
    params = quote(f'?url="{target_url.value}"&archive="{repository.value}"')
    link = f'{binder_url}{params}'
    with out:
        display(HTML(f'<p>Share this: <a href="{link}">{link}</a></p>'))

def start(e):
    '''
    Button callback: clear any previous output, then display all text
    changes for the current url/archive and a shareable link.
    '''
    # Forward the triggering event rather than a placeholder string
    clear(e)
    show_all_differences(repository.value, target_url.value)
    share_this()
    

def clear(e):
    '''Button callback: drop any stored captures and wipe the output area.'''
    global html_data
    html_data.clear()
    out.clear_output()
    
# Output area that all diff tables and messages are written into
out = widgets.Output()

# Archive picker -- pre-selected from the 'archive' parameter if supplied
repository = widgets.Dropdown(
    options=[('---', ''), ('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],
    description='Archive:',
    disabled=False,
    value=archive
)

# Url of the page to examine -- pre-filled from the 'url' parameter if supplied
target_url = widgets.Text(description='URL:', value=url)

tc_button = widgets.Button(description='Show text changes', button_style='primary')
tc_button.on_click(start)
clear_button = widgets.Button(description='Clear all')
clear_button.on_click(clear)

display(widgets.HBox([repository, target_url], layout=widgets.Layout(padding='10px')))
display(widgets.HBox([tc_button, clear_button], layout=widgets.Layout(padding='10px')))
display(out)

# If both parameters were supplied (eg via a share link) run immediately
if archive and url:
    start('e')

Created by Tim Sherratt for the GLAM Workbench.

Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020.