Find when a piece of text appears in an archived web page

This notebook helps you find when a particular piece of text appears in, or disappears from, a web page. Using Memento Timemaps, it gets a list of available captures from the selected web archive. It then searches each capture for the desired text, displaying the results.

You can select the direction in which the notebook searches:

  • First occurrence – find the first capture in which the text appears (start from the first capture and move forward in time)
  • Last occurrence – find the last capture in which the text appears (start from the most recent capture and go backwards in time)
  • All occurrences – find all matches (start from the first capture and continue until the last)

If you select 'All occurrences' the notebook will generate a simple chart showing how the number of matches changes over time.

By default, the notebook displays possible or 'fuzzy' matches as well as exact matches, but these are not counted in the totals.

In [ ]:
import requests
from IPython.display import display, HTML
import re
import arrow
from bs4 import BeautifulSoup, Tag
import ipywidgets as widgets
import json
import time
from fuzzysearch import find_near_matches
import altair as alt
import pandas as pd

# Inject the CSS used to highlight search results in the notebook output:
# .x-match marks exact matches and .p-match marks possible ('fuzzy') matches.
HTML('<style>.x-match {background-color: #ccffcc;} .p-match {background-color: #ffffcc;}</style>')
In [ ]:
%%javascript
// This is necessary in Jupyter notebook to stop the output area folding up.
// It replaces the output area's _should_scroll check with a function that
// always returns false, whatever the number of lines.
// Will give an error in Jupyter Lab
IPython.OutputArea.prototype._should_scroll = function(lines) {return false}
In [ ]:
# Default list of repositories -- you could add to this
# Maps a short repository key (the values offered by the 'Archive' dropdown)
# to the base url of that web archive. Other code in this notebook appends
# 'timemap/json/<url>/' to request a Timemap and '<timestamp>id_/<url>' to
# request an individual capture, so each base url ends with a slash.
TIMEGATES = {
    'nla': 'https://web.archive.org.au/awa/',  # National Library of Australia
    'nlnz': 'https://ndhadeliver.natlib.govt.nz/webarchive/wayback/',  # National Library of New Zealand
    'bl': 'https://www.webarchive.org.uk/wayback/archive/',  # UK Web Archive
    'ia': 'https://web.archive.org/web/'  # Internet Archive
}
In [ ]:
def get_html(url):
    '''
    Request a capture url and return what came back.

    Returns a dict with two keys -- 'url' (the url reported by the response)
    and 'html' (the response body as bytes). Returns None if the response url
    does not contain a 14 digit timestamp followed by 'id_', ie the request
    did not end up at a capture.
    '''
    response = requests.get(url)
    # Sometimes the Mementos don't go to captures?!
    # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/
    landed_on_capture = re.search(r'/(\d{14})id_/', response.url) is not None
    if not landed_on_capture:
        return None
    return {'url': response.url, 'html': response.content}

def format_date(url):
    '''
    Find the 14 digit capture timestamp in a url and return it as a
    human readable date (day, full month name, year).

    Raises AttributeError if the url contains no '/<timestamp>id_/' segment.
    '''
    stamp_match = re.search(r'/(\d{14})id_/', url)
    captured = arrow.get(stamp_match.group(1), 'YYYYMMDDHHmmss')
    return captured.format('D MMMM YYYY')

def format_date_as_iso(url):
    '''
    Find the 14 digit capture timestamp in a url and return the date part
    in ISO form (YYYY-MM-DD).

    Raises AttributeError if the url contains no '/<timestamp>id_/' segment.
    '''
    stamp_match = re.search(r'/(\d{14})id_/', url)
    captured = arrow.get(stamp_match.group(1), 'YYYYMMDDHHmmss')
    return captured.format('YYYY-MM-DD')

def convert_lists_to_dicts(results):
    '''
    Turn an IA style timemap (a list of lists whose first row holds the
    column names) into a list of dictionaries, one per capture.

    Three keys are renamed so the output lines up with other Timemaps:
    'statuscode' -> 'status', 'mimetype' -> 'mime', 'original' -> 'url'.
    An empty input is returned unchanged.
    '''
    renames = (('statuscode', 'status'), ('mimetype', 'mime'), ('original', 'url'))
    if results:
        header, *rows = results
        records = [dict(zip(header, row)) for row in rows]
    else:
        records = results
    for record in records:
        for old_key, new_key in renames:
            record[new_key] = record.pop(old_key)
    return records

def get_capture_data_from_memento(url, request_type='head'):
    '''
    For OpenWayback systems this can get some extra capture info to insert in Timemaps.

    Makes a HEAD request (or a GET request if request_type is anything other
    than 'head') and reads three 'x-archive-orig-*' response headers.
    Returns a dict with 'length', 'status' (the part of the status header
    before the first space) and 'mime' (the part of the content type before
    the first ';'). Any header that is absent is reported as None.
    '''
    fetch = requests.head if request_type == 'head' else requests.get
    headers = fetch(url).headers

    def leading_part(value, separator):
        # Keep only what comes before the separator; missing headers stay None.
        return value.split(separator)[0] if value else None

    return {
        'length': headers.get('x-archive-orig-content-length'),
        'status': leading_part(headers.get('x-archive-orig-status'), ' '),
        'mime': leading_part(headers.get('x-archive-orig-content-type'), ';'),
    }

def convert_link_to_json(results, enrich_data=False):
    '''
    Converts link formatted Timemap to JSON.

    Each line of a link formatted Timemap looks like
    '<link>; rel="..."; other="params",'. Every line whose rel value includes
    the 'memento' relation type is treated as a capture -- this covers plain
    'memento' as well as 'first memento', 'last memento' and combined values
    such as 'first last memento', so the earliest and latest captures are not
    dropped. Lines with any other rel value (original, self, timegate, ...),
    lines without parameters, and memento links that don't contain a 14 digit
    timestamp are skipped rather than raising an error.

    Parameters:
        results -- the Timemap as a single string
        enrich_data -- if True, request each memento to add length, status
                       and mime details (one extra HTTP request per capture)

    Returns a list of dicts with 'timestamp' and 'url' keys (plus the
    enrichment keys when requested), in the order they appear in the Timemap.
    '''
    data = []
    for line in results.splitlines():
        link_part, separator, params = line.partition('; ')
        if not separator:
            # No parameters at all, so this can't be a memento link
            continue
        rel = re.search(r'rel="([^"]*)"', params)
        # rel can hold several space separated relation types
        if rel is None or 'memento' not in rel.group(1).split():
            continue
        link = link_part.strip('<>')
        capture_parts = re.search(r'/(\d{14})/(.*)$', link)
        if capture_parts is None:
            continue
        timestamp, original = capture_parts.groups()
        capture = {'timestamp': timestamp, 'url': original}
        if enrich_data:
            capture.update(get_capture_data_from_memento(link))
        data.append(capture)
    return data
                
def get_timemap_as_json(timegate, url):
    '''
    Get a Timemap then normalise results (if necessary) to return a list of dicts.

    Parameters:
        timegate -- a key in TIMEGATES identifying the web archive
        url -- the url of the page whose captures are wanted

    The format of the Timemap is worked out from the media type in the
    response's content-type header. Any parameters (such as a charset) and
    differences in case are ignored, so 'application/json; charset=utf-8'
    is handled the same way as 'application/json'.

    Raises KeyError if timegate is not in TIMEGATES, and ValueError if the
    archive responds with a content type that isn't a known Timemap format.
    '''
    tg_url = f'{TIMEGATES[timegate]}timemap/json/{url}/'
    response = requests.get(tg_url)
    response_type = response.headers.get('content-type', '')
    media_type = response_type.split(';')[0].strip().lower()
    # pywb style Timemap -- one JSON object per line
    if media_type == 'text/x-ndjson':
        return [json.loads(line) for line in response.text.splitlines() if line.strip()]
    # IA Wayback style Timemap
    if media_type == 'application/json':
        return convert_lists_to_dicts(response.json())
    # Link style Timemap (OpenWayback) -- sometimes delivered as text/html
    if media_type in ('application/link-format', 'text/html'):
        return convert_link_to_json(response.text)
    raise ValueError(f'Unrecognised Timemap content type {response_type!r} from {tg_url}')

def display_chart(matches):
    '''
    Draw a line chart (with a point at each value) of matches over time
    in the chart output area.

    matches is a list of dicts with 'date' and 'matches' keys.
    '''
    date_field, count_field = 'date:T', 'matches:Q'
    line = alt.Chart(pd.DataFrame(matches)).mark_line(point=True)
    chart = line.encode(
        x=date_field,
        y=count_field,
        tooltip=[date_field, count_field]
    )
    with chart_display:
        display(chart)

def process_text(html):
    '''
    Pull the text out of an HTML page and return it as a list of lines,
    leaving out any line that is empty or contains only whitespace.
    '''
    page_text = BeautifulSoup(html).get_text()
    kept = []
    for line in page_text.splitlines():
        if re.match(r'^\s*$', line):
            continue
        kept.append(line)
    return kept

def format_date_link(url):
    '''
    Build an HTML link for a capture: the link text is the formatted capture
    date and the target is the capture url with 'id_' removed.
    '''
    label = format_date(url)
    target = url.replace("id_", "")
    return f'<a href="{target}">{label}</a>'

def format_context(text, match):
    '''
    Extract, markup, and format context around a match.

    Parameters:
        text -- the full text that was searched
        match -- an object with start, end and dist attributes giving the
                 position of the match in text and its edit distance

    Returns an HTML snippet of the form '...before <span>match</span> after...'.
    The span has class 'x-match' for exact matches (dist of 0) and 'p-match'
    otherwise. Up to 40 characters of context are taken from each side, then
    trimmed back to the nearest space so words aren't cut in half.

    The context before and after the match is trimmed separately, so the
    trimming can never cut into the span markup or the matched text (this
    used to happen when there was no space on one side of the match). The
    page text is HTML-escaped because it comes from arbitrary archived pages.
    '''
    from html import escape

    style = 'p-match' if match.dist > 0 else 'x-match'
    before = text[max(0, match.start - 40):match.start]
    matched = text[match.start:match.end]
    after = text[match.end:match.end + 40]
    # Drop the (probably partial) word at the outer edge of each side
    if ' ' in before:
        before = before[before.index(' '):]
    if ' ' in after:
        after = after[:after.rindex(' ')]
    # quote=False -- the text only ever lands in element content, not attributes
    before = escape(before.lstrip(), quote=False)
    matched = escape(matched, quote=False)
    after = escape(after.rstrip(), quote=False)
    return f'...{before}<span class="{style}">{matched}</span>{after}...'

def search_page(capture_data, pattern):
    '''
    Search for a text string in the html of a page.

    Parameters:
        capture_data -- a dict with 'html' (the page content) and 'url'
                        (the capture url) keys
        pattern -- the text to look for

    The search ignores case and allows an edit distance of 1, so near misses
    are reported as possible matches. If anything is found, a heading linking
    to the capture and a list of matches in context is written to the results
    output area.

    Returns the number of exact matches (edit distance of 0) only.
    '''
    text = BeautifulSoup(capture_data['html']).get_text()
    # Already a complete <a> element, so it must not be wrapped in another link
    date_link = format_date_link(capture_data['url'])
    # NOTE(review): match offsets refer to the casefolded text but are applied to the
    # original text in format_context -- these can drift apart for the few characters
    # whose casefolded form has a different length (eg 'ß').
    matches = find_near_matches(pattern.casefold(), text.casefold(), max_l_dist=1)
    if not matches:
        return 0
    items = ''.join(f'<li>\'{format_context(text, match)}\'</li>' for match in matches)
    results = f'<h4>{date_link}</h4><ul>{items}</ul>'
    with out:
        display(HTML(results))
    return sum(1 for match in matches if match.dist == 0)

def update_status(i, total_matches):
    '''
    Refresh the status area with the number of captures processed so far
    (i is a zero-based index, so i + 1 is shown) and the running total of
    exact matches.
    '''
    progress_lines = (
        f'Captures processed: {i + 1}',
        f'Exact matches found: {total_matches}',
    )
    with status:
        # wait=True delays the clear until new output arrives
        status.clear_output(wait=True)
        for line in progress_lines:
            display(HTML(line))

def find_text(timegate, url, pattern, direction):
    '''
    Get all captures for a page from a Timemap, then search for requested text in each page,
    aggregating the results.

    Parameters:
        timegate -- a key in TIMEGATES identifying the web archive
        url -- the url of the page to search
        pattern -- the text to look for
        direction -- 'first' (oldest capture first, stop at the first capture with
                     an exact match), 'last' (newest capture first, stop at the
                     first capture with an exact match) or 'all' (search every
                     capture, then chart the results)

    To save requests, a capture is skipped if its digest is the same as that of the
    capture processed immediately before it, or if its status isn't 200. Captures
    that carry no digest or status information are always searched, as are all
    captures from 'nlnz'.
    '''
    total_matches = 0
    matches = []
    stop_at_first_hit = direction in ['first', 'last']
    with out:
        key = '<b>Key</b><ul><li><span class="x-match">exact match</span></li><li><span class="p-match">possible match</span></li></ul>'
        display(HTML(key))
    timemap = get_timemap_as_json(timegate, url)
    if direction == 'last':
        timemap.reverse()
    # Defined up front so the final status update works even if there are no captures
    i = -1
    # Remember the previous capture's digest explicitly. Indexing timemap[i-1]
    # would compare the first capture against the last one.
    previous_digest = None
    for i, capture in enumerate(timemap):
        capture_url = f'{TIMEGATES[timegate]}{capture["timestamp"]}id_/{capture["url"]}'
        digest = capture.get('digest')
        status_code = capture.get('status')
        changed = digest is None or digest != previous_digest
        status_ok = status_code is None or str(status_code) == '200'
        previous_digest = digest
        if timegate == 'nlnz' or (changed and status_ok):
            capture_data = get_html(capture_url)
            if capture_data:
                found = search_page(capture_data, pattern)
                total_matches += found
                if found > 0:
                    matches.append({'date': format_date_as_iso(capture_url), 'matches': found})
                    if stop_at_first_hit:
                        break
        update_status(i, total_matches)
    if stop_at_first_hit:
        # The break above skips the in-loop update, so report the final numbers here
        update_status(i, total_matches)
    elif matches:
        display_chart(matches)

def start(e):
    '''
    Click handler for the 'Find text' button.

    Clears any previous output, checks that an archive, url and search text
    have all been supplied, then runs the search. If something is missing a
    message is shown in the status area instead (the default archive
    selection is empty, which would otherwise raise a KeyError).
    '''
    clear(e)
    page_url = target_url.value.strip()
    if not (repository.value and page_url and search_string.value.strip()):
        with status:
            display(HTML('Please select an archive, and enter a URL and some search text.'))
        return
    find_text(repository.value, page_url, search_string.value, search_direction.value)
    
def clear(e):
    '''
    Click handler that empties the status, chart and results output areas.
    The event argument is not used.
    '''
    for output_area in (status, chart_display, out):
        output_area.clear_output()
    
# Output areas written to by the functions above:
# out holds the key and the matches in context, status holds the running
# totals, and chart_display holds the chart of matches over time.
out = widgets.Output()
status = widgets.Output()
chart_display = widgets.Output()

# Archive selector -- the option values are keys in TIMEGATES.
# The default is the empty '---' option.
repository = widgets.Dropdown(
    options=[('---', ''), ('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],
    description='Archive:',
    disabled=False,
    value=''
)

# Search direction -- the option values are the direction strings tested in find_text()
search_direction = widgets.Dropdown(
    options=[('First occurrence', 'first'), ('Last occurrence', 'last'), ('All occurrences', 'all')],
    description='Find:',
    disabled=False,
    value='first'
)

# The url of the page to look for in the archive
target_url = widgets.Text(description='URL:')

# The text to search for in each capture
search_string = widgets.Text(description='Search text:')

# Buttons -- 'Find text' runs start() and 'Clear all' runs clear()
tc_button = widgets.Button(description='Find text', button_style='primary')
tc_button.on_click(start)
clear_button = widgets.Button(description='Clear all')
clear_button.on_click(clear)

# Lay out the form in three rows, followed by the status, chart and results areas
display(widgets.HBox([repository, target_url], layout=widgets.Layout(padding='10px')))
display(widgets.HBox([search_string, search_direction], layout=widgets.Layout(padding='10px')))
display(widgets.HBox([tc_button, clear_button], layout=widgets.Layout(padding='10px')))
display(status)
display(chart_display)
display(out)

Created by Tim Sherratt for the GLAM Workbench.

Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020