# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.
This notebook displays changes in the text content of a web page over time. It retrieves a list of available captures from a Memento Timemap, then compares each capture with its predecessor, displaying changes side-by-side.
By default, the notebook only displays lines that have changed. If you want to see more context, you can adjust the parameters passed to show_line_differences()
inside the show_all_differences() function to show lines around each change, or the complete text content.
import json
import os
import re
from collections import deque
from difflib import HtmlDiff
from urllib.parse import parse_qs, quote
import arrow
import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
# This is to restyle the standard html table output from difflib
# (displaying this HTML object injects the CSS into the page, so the
# tables produced by HtmlDiff.make_table below pick up these styles)
HTML(
    "<style>.diff_add {background-color: #ccffcc;} .diff_chg {background-color: #ffffcc; text-decoration: underline;} .diff_sub {background-color: #ffcccc; text-decoration: line-through;} table.diff, table.diff thead {border: 1px solid black;} table.diff {table-layout: fixed; width: 100%; margin-bottom: 20px;} th.diff_next, td.diff_next {width: 4%;} table.diff th.diff_header {text-align: left;} table.diff tbody {border: none;} table.diff td {word-break: break-word; text-align: left;}</style>"
)
# Default list of repositories -- you could add to this
# Keys are the short codes used by the `repository` dropdown below;
# values are the base urls of each archive's Wayback/Timegate endpoint
# (note the trailing slash -- urls are built by simple concatenation).
TIMEGATES = {
    "nla": "https://web.archive.org.au/awa/",
    "nlnz": "https://ndhadeliver.natlib.govt.nz/webarchive/wayback/",
    "bl": "https://www.webarchive.org.uk/wayback/archive/",
    "ia": "https://web.archive.org/web/",
    "ukgwa": "https://webarchive.nationalarchives.gov.uk/ukgwa/"
}
# This deque will only hold a maximum of two values
# So we can just push new pages into it, and it will shove the old ones out the back.
# Holds the {"url": ..., "html": ...} dicts of the two captures currently being compared.
html_data = deque("", 2)
def get_html(url):
    """
    Fetch a capture and return a dict with its final url and raw HTML content.
    Returns None when the request doesn't resolve to an actual capture
    (ie the final url lacks a /<timestamp>id_/ segment).
    """
    response = requests.get(url)
    # Sometimes the Mementos don't go to captures?!
    # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/
    if re.search(r"/(\d{12}|\d{14})id_/", response.url) is None:
        return None
    return {"url": response.url, "html": response.content}
def format_date(url):
    """
    Pull the 12 or 14 digit timestamp out of a capture url and return it
    as a human-readable date, eg '12 September 2009'.
    """
    match = re.search(r"/(\d{12}|\d{14})id_/", url)
    return arrow.get(match.group(1), "YYYYMMDDHHmmss").format("D MMMM YYYY")
def convert_lists_to_dicts(results):
    """
    Converts IA style timemap (a JSON array of arrays) to a list of dictionaries.
    Renames keys to standardise IA with other Timemaps.
    """
    if not results:
        return results
    # First row is the header; remaining rows are the captures
    header, *rows = results
    converted = [dict(zip(header, row)) for row in rows]
    # Standardise the IA-specific field names
    renames = (
        ("statuscode", "status"),
        ("mimetype", "mime"),
        ("original", "url"),
    )
    for record in converted:
        for old_key, new_key in renames:
            record[new_key] = record.pop(old_key)
    return converted
def get_capture_data_from_memento(url, request_type="head"):
    """
    For OpenWayback systems this can get some extra capture info to insert in Timemaps.
    """
    # A HEAD request is enough for the headers; GET is offered as a fallback
    fetch = requests.head if request_type == "head" else requests.get
    headers = fetch(url).headers
    status = headers.get("x-archive-orig-status")
    mime = headers.get("x-archive-orig-content-type")
    return {
        "length": headers.get("x-archive-orig-content-length"),
        # Keep just the numeric code, eg '200' from '200 OK'
        "status": status.split(" ")[0] if status else None,
        # Strip any charset suffix, eg 'text/html; charset=utf-8'
        "mime": mime.split(";")[0] if mime else None,
    }
def convert_link_to_json(results, enrich_data=False):
    """
    Converts link formatted Timemap to JSON.
    """
    rel_pattern = re.compile(
        r'rel="(original|self|timegate|first memento|last memento|memento)"'
    )
    captures = []
    for line in results.splitlines():
        fields = line.split("; ")
        if len(fields) <= 1:
            continue
        rel = rel_pattern.search(fields[1]).group(1)
        if rel != "memento":
            continue
        memento_url = fields[0].strip("<>")
        timestamp, original = re.search(
            r"/(\d{12}|\d{14})/(.*)$", memento_url
        ).groups()
        capture = {"timestamp": timestamp, "url": original}
        if enrich_data:
            # Query the memento itself for status/mime/length headers
            capture.update(get_capture_data_from_memento(memento_url))
        captures.append(capture)
    return captures
def get_timemap_as_json(timegate, url):
    """
    Get a Timemap then normalise results (if necessary) to return a list of dicts.

    Raises ValueError if the repository returns a Timemap in an unrecognised format.
    """
    tg_url = f"{TIMEGATES[timegate]}timemap/json/{url}/"
    response = requests.get(tg_url)
    response_type = response.headers["content-type"]
    # Compare on the base media type so an optional charset suffix
    # (eg 'text/x-ndjson; charset=utf-8') doesn't break the matching
    base_type = response_type.split(";")[0].strip().lower()
    # pywb style Timemap
    if base_type == "text/x-ndjson":
        data = [json.loads(line) for line in response.text.splitlines()]
    # IA Wayback style Timemap
    elif base_type == "application/json":
        data = convert_lists_to_dicts(response.json())
    # Link style Timemap (OpenWayback); some servers label it text/html
    elif base_type in ("application/link-format", "text/html"):
        data = convert_link_to_json(response.text)
    else:
        # Previously an unrecognised content type fell through and raised
        # an opaque UnboundLocalError on the return statement below.
        raise ValueError(f"Unrecognised Timemap format: {response_type}")
    return data
def process_text(html):
    """
    Extract text from an HTML page and return it as a list of lines.
    Removes blank lines.
    """
    # Name the parser explicitly -- without it BeautifulSoup emits a
    # GuessedAtParserWarning and the extracted text can vary between
    # environments depending on which parsers are installed.
    text = BeautifulSoup(html, "html.parser").get_text()
    # Keep only lines containing at least one non-whitespace character
    return [line for line in text.splitlines() if line.strip()]
def format_date_link(url):
    """Return an html link to the capture, labelled with its formatted date."""
    label = format_date(url)
    return f'<a href="{url}">{label}</a>'
def show_line_differences(context=True, numlines=0):
    """
    Use difflib to show a side-by-side comparison of the text in two web pages.
    """
    # html_data always holds exactly two captures when this is called
    old_capture, new_capture = html_data
    table = HtmlDiff().make_table(
        process_text(old_capture["html"]),
        process_text(new_capture["html"]),
        context=context,
        numlines=numlines,
        fromdesc=format_date_link(old_capture["url"]),
        todesc=format_date_link(new_capture["url"]),
    )
    # Rewrite the table html to make the column widths work better
    table = table.replace(
        r'<th colspan="2" class="diff_header"',
        '<th class="diff_next"></th><th class="diff_header"',
    )
    # Cleaning up the table output
    table = table.replace('nowrap="nowrap"', "")
    table = table.replace("<tbody>", "").replace("</tbody>", "")
    with out:
        display(HTML(table))
def show_all_differences(timegate, url):
    """
    Get all captures for a page from a Timemap, then compare each page with its predecessor,
    display changes side-by-side.

    Captures are skipped when their digest is unchanged from the previous capture
    or their status isn't 200; NLNZ Timemaps don't include those fields, so every
    NLNZ capture is processed.
    """
    timemap = get_timemap_as_json(timegate, url)
    with out:
        key = '<b>Key</b><ul><li><span class="diff_sub">deleted text</span></li><li><span class="diff_chg">changed text</span></li><li><span class="diff_add">added text</span></li></ul>'
        display(HTML(key))
    for i, capture in enumerate(timemap):
        capture_url = f'{TIMEGATES[timegate]}{capture["timestamp"]}id_/{capture["url"]}'
        # The first capture has no predecessor, so always treat it as changed
        # (previously timemap[i - 1] wrapped around to -1, comparing the first
        # capture's digest against the *last* capture's and possibly skipping it).
        if timegate == "nlnz" or (
            (i == 0 or capture["digest"] != timemap[i - 1]["digest"])
            and capture["status"] == "200"
        ):
            capture_data = get_html(capture_url)
            if capture_data:
                # html_data only keeps the two most recent captures
                html_data.append(capture_data)
                if len(html_data) == 2:
                    # You could change the params below to show context around changes
                    # context=False -- shows the whole document
                    # numlines -- when context=True, the number of lines to show around the diff
                    # numline=0 -- just the diffs
                    show_line_differences(context=True, numlines=0)
def share_this():
    """Display a url that re-runs the current query via Binder/Voila."""
    binder_url = "https://mybinder.org/v2/gh/GLAM-Workbench/web-archives/master?urlpath=/voila/render/display-text-changes-from-timemap.ipynb"
    # Encode the current widget values as a query string
    parameter_string = quote(f"?url={target_url.value}&archive={repository.value}")
    share_url = binder_url + parameter_string
    with out:
        display(HTML(f'<p>Share this: <a href="{share_url}">{share_url}</a></p>'))
def start(e):
    """Button handler -- reset the display, run the comparison, show a share link."""
    clear(e)
    show_all_differences(repository.value, target_url.value)
    share_this()
def clear(e):
    """Button handler -- empty the capture cache and the output area.

    `e` is the widget event and is ignored.
    """
    # Both calls mutate in place, so no `global` declaration is needed
    html_data.clear()
    out.clear_output()
# Read default values for the url and archive from the query string
# (Voila exposes it via the QUERY_STRING environment variable, so the
# app can be launched pre-configured from a shared link)
query_string = os.environ.get("QUERY_STRING", "")
parameters = parse_qs(query_string)
url = parameters.get("url", [""])[0]
archive = parameters.get("archive", [""])[0]
# Output area where the key, diff tables, and share link are displayed
out = widgets.Output()
# Archive selector -- option values are keys into TIMEGATES
repository = widgets.Dropdown(
    options=[
        ("---", ""),
        ("UK Web Archive", "bl"),
        ("UK Government Web Archive", "ukgwa"),
        ("National Library of Australia", "nla"),
        ("National Library of New Zealand", "nlnz"),
        ("Internet Archive", "ia"),
    ],
    description="Archive:",
    disabled=False,
    value=archive,
)
target_url = widgets.Text(description="URL:", value=url)
tc_button = widgets.Button(
    description="Show text changes",
    button_style="primary",
)
tc_button.on_click(start)
clear_button = widgets.Button(description="Clear all")
clear_button.on_click(clear)
# Lay out the controls in two rows, with the output area underneath
display(widgets.HBox([repository, target_url], layout=widgets.Layout(padding="10px")))
display(widgets.HBox([tc_button, clear_button], layout=widgets.Layout(padding="10px")))
display(out)
%%capture
%load_ext dotenv
%dotenv
# Insert some values for automated testing
if os.getenv("GW_STATUS") == "dev":
    url = "http://discontents.com.au/2017-the-making-and-the-talking/"
    archive = "ia"
    target_url.value = url
    repository.value = archive
# If values have been provided via url or above, then start automatically.
# Note that Voila widgets don't load immediately, hence the polling to
# make sure the start button exists.
if url and archive:
    # Inject javascript that clicks the primary ('Show text changes')
    # button once it appears in the page, retrying every 5ms until it does.
    script = """
<script type="text/javascript">
function start() {
if (document.querySelector("button")) {
let button = document.querySelector("button.mod-primary");
button.click();
} else {
setTimeout(start, 5);
}
}
start();
</script>"""
    display(HTML(script))
Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!
Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020.
The Web Archives section of the GLAM Workbench is sponsored by the British Library.