#!/usr/bin/env python
# coding: utf-8

# # Get full page screenshots from archived web pages
#
# [View in GitHub](https://github.com/GLAM-Workbench/web-archives/blob/master/save_screenshot.ipynb) · [View in GLAM Workbench](https://glam-workbench.net/web-archives/#create-and-compare-full-page-screenshots-from-archived-web-pages)

# In[ ]:


# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.


# In[ ]:


get_ipython().run_cell_magic('capture', '', 'import base64\nimport html\nimport io\nimport math\nimport os\nimport re\nimport time\nfrom pathlib import Path\nfrom urllib.parse import urlparse\n\nimport arrow\nimport geckodriver_autoinstaller\nimport ipywidgets as widgets\nimport PIL\nimport requests\nimport selenium\nfrom IPython.display import HTML, display\nfrom PIL import Image\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom slugify import slugify\n\ngeckodriver_autoinstaller.install()\n')


# In[ ]:


# Base Timegate url for each supported archive, keyed by the values used in
# the 'Archive' dropdown.
TIMEGATES = {
    "nla": "https://web.archive.org.au/awa/",
    "nlnz": "https://ndhadeliver.natlib.govt.nz/webarchive/",
    "bl": "https://www.webarchive.org.uk/wayback/archive/",
    "ia": "https://web.archive.org/web/",
    "ukgwa": "https://webarchive.nationalarchives.gov.uk/ukgwa/",
}

# Domains whose replay urls get an 'if_' modifier added before capture.
wayback = ["web.archive.org"]

# Domains using pywb framed replay, mapped to the name of the frame that
# holds the archived page.
pywb = {
    "web.archive.org.au": "replayFrame",
    "webarchive.nla.gov.au": "replayFrame",
    "webarchive.org.uk": "replay_iframe",
    "ndhadeliver.natlib.govt.nz": "replayFrame",
    "webarchive.nationalarchives.gov.uk": "replay_iframe",
}

# HTML fragments for the screenshots generated so far (one per screenshot).
html_output = []


def format_date_for_headers(iso_date, tz):
    """
    Convert an ISO date (YYYY-MM-DD) to a datetime at noon in the specified timezone.
    Convert the datetime to UTC and format as required by Accept-Datetime headers:
    eg Fri, 23 Mar 2007 01:00:00 GMT
    """
    local = arrow.get(f"{iso_date} 12:00:00 {tz}", "YYYY-MM-DD HH:mm:ss ZZZ")
    gmt = local.to("utc")
    return f'{gmt.format("ddd, DD MMM YYYY HH:mm:ss")} GMT'


def format_date_from_timestamp(url):
    """
    Extract the 14 or 12 digit timestamp from a Memento url and format it
    for display, eg: 1 January 2015

    Raises AttributeError if the url doesn't contain a timestamp.
    """
    timestamp = re.search(r"/(\d{14}|\d{12})(?:if_|mp_|id_)*/", url).group(1)
    # 12 digit timestamps have no seconds, so they need their own format string
    fmt = "YYYYMMDDHHmmss" if len(timestamp) == 14 else "YYYYMMDDHHmm"
    return arrow.get(timestamp, fmt).format("D MMMM YYYY")


def parse_links_from_headers(response):
    """
    Extract original, timegate, timemap, and memento links from 'Link' header.
    Returns a dict mapping each link's key in `response.links` to its url.
    """
    links = response.links
    return {k: v["url"] for k, v in links.items()}


def query_timegate(timegate, url, date=None, tz="Australia/Canberra", timeout=60):
    """
    Query the Timegate of the selected archive for captures of the supplied url.

    Parameters:
        timegate: key of the archive in TIMEGATES
        url: the url to look for in the archive
        date: preferred capture date (YYYY-MM-DD), optional
        tz: timezone used to interpret the date
        timeout: seconds to wait for the archive to respond

    Returns a dict of the links found in the response's 'Link' header.
    """
    # BL, NLNZ & UKGWA don't seem to default to latest date if no date supplied
    if not date and timegate in ("bl", "nlnz", "ukgwa"):
        date = arrow.utcnow().format("YYYY-MM-DD")
    headers = {}
    if date:
        headers["Accept-Datetime"] = format_date_for_headers(date, tz)
    # Note that you don't get a timegate response if you leave off the trailing slash, but extras don't hurt!
    tg_url = (
        f"{TIMEGATES[timegate]}{url}/"
        if not url.endswith("/")
        else f"{TIMEGATES[timegate]}{url}"
    )
    # IA doesn't work with head, others don't work with get...
    request = requests.get if timegate == "ia" else requests.head
    # Without a timeout an unresponsive archive would hang the app indefinitely
    response = request(tg_url, headers=headers, timeout=timeout)
    return parse_links_from_headers(response)


def get_memento(timegate, url, date):
    """
    Get the url of the Memento closest to the supplied date.
    Returns None if the Timegate response contains no usable link.
    """
    links = query_timegate(timegate, url, date)
    # NLNZ doesn't always seem to return a Memento, so we'll build in some fuzziness
    for rel in ("memento", "prev memento", "next memento", "last memento"):
        if rel in links:
            return links[rel]
    return None


def get_full_page_screenshot(url, save_width=200, load_wait=30):
    """
    Gets a full page screenshot of the supplied url.
    By default resizes the screenshot to a maximum width of 200px.
    Provide a 'save_width' value to change this.
    'load_wait' is the number of seconds to pause after requesting the page,
    to give everything time to load.

    Returns the path of the saved image, or None if the url isn't a Memento
    or no screenshot could be captured (an error message is shown instead).

    NOTE the webdriver sometimes fails for unknown reasons. Just try again.
    """
    domain = urlparse(url)[1].replace("www.", "")
    # NZ and IA inject content into the page, so we use if_ to get the original page (with rewritten urls)
    if domain in wayback and "if_" not in url:
        url = re.sub(r"/(\d{14}|\d{12})/http", r"/\1if_/http", url)
    match = re.search(r"/(\d{14}|\d{12})(?:if_|mp_)*/https*://?(.+/)", url)
    if match is None:
        # There's something wrong with the link...
        show_error(f"{url} isn't a Memento – did you forget to select an archive?")
        return None
    date_str, site = match.groups()
    output_dir = Path("screenshots")
    output_dir.mkdir(parents=True, exist_ok=True)
    ss_file = Path(output_dir, f"{slugify(site)}-{date_str}-{save_width}.png")
    options = webdriver.FirefoxOptions()
    # The 'headless' attribute is deprecated in Selenium 4, the argument works everywhere
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    ss = None
    try:
        driver.implicitly_wait(15)
        driver.get(url)
        # Give some time for everything to load
        time.sleep(load_wait)
        driver.maximize_window()
        # UK and AU use pywb in framed replay mode, so we need to switch to the framed content
        if domain in pywb:
            try:
                driver.switch_to.frame(pywb[domain])
            except selenium.common.exceptions.NoSuchFrameException:
                # If we pass here we'll probably still get a ss, just not full page -- better than failing?
                pass
        # Use the first of these elements that can be captured
        for tag in ["body", "html", "frameset"]:
            try:
                elem = driver.find_element(By.TAG_NAME, tag)
                ss = elem.screenshot_as_base64
                break
            except (
                selenium.common.exceptions.NoSuchElementException,
                selenium.common.exceptions.WebDriverException,
            ):
                pass
    finally:
        # Always shut the browser down, even if loading the page failed
        driver.quit()
    if not ss:
        show_error(f"Couldn't get a screenshot of {url} – sorry...")
        return None
    img = Image.open(io.BytesIO(base64.b64decode(ss)))
    ratio = save_width / img.width
    (width, height) = (save_width, math.ceil(img.height * ratio))
    resized_img = img.resize((width, height), PIL.Image.Resampling.LANCZOS)
    resized_img.save(ss_file)
    return ss_file


def display_screenshot(ss_file, url):
    """
    Add a screenshot to the results and redisplay them all.

    Parameters:
        ss_file: path of the saved screenshot image
        url: url of the Memento that was captured
    """
    date = format_date_from_timestamp(url)
    # Show the archived url without the archive prefix, timestamp, and modifiers
    match = re.search(r"/(?:\d{14}|\d{12})(?:mp_|if_|id_)*/(.*)$", url)
    display_url = match.group(1) if match else url
    status.clear_output()
    # NOTE(review): minimal markup -- the image linked to the full size file,
    # plus a download link. Adjust the styling to suit the intended layout.
    html_output.append(
        f'<div style="display: inline-block; vertical-align: top; margin-right: 20px;">'
        f'<p style="margin-bottom: 0;">{date}<br>'
        f'<a href="{html.escape(url)}">{html.escape(display_url)}</a></p>'
        f'<a href="{ss_file}"><img src="{ss_file}"></a><br>'
        f'<a href="{ss_file}" download>[Download]</a>'
        f"</div>"
    )
    with out:
        display((HTML("".join(html_output))))


def show_error(message=None):
    """
    Replace the contents of the status area with an error message.
    """
    status.clear_output()
    with status:
        print(f"Something went wrong – {message}")


def start(e):
    """
    Button handler: find a Memento using the form values, then capture
    and display a screenshot of it.
    """
    status.clear_output()
    out.clear_output(wait=True)
    with status:
        print("Generating screenshot...")
    # Stray whitespace from copy & paste would break the Timegate query
    url = target_url.value.strip()
    if repository.value:
        try:
            memento = get_memento(repository.value, url, target_date.value)
        except requests.exceptions.RequestException:
            show_error("couldn't contact the web archive – sorry...")
            return
    else:
        # No archive selected, so assume we've been given the url of a Memento
        memento = url
    if not memento:
        show_error("couldn't find a Memento – sorry...")
        return
    try:
        ss_file = get_full_page_screenshot(memento, save_width=width.value)
        if ss_file:
            display_screenshot(ss_file, memento)
    except selenium.common.exceptions.WebDriverException:
        show_error(f"couldn't get a screenshot of {memento} – sorry...")


def clear(e):
    """
    Button handler: remove all screenshots and messages.
    """
    global html_output
    html_output = []
    status.clear_output()
    out.clear_output()


def clear_last(e):
    """
    Button handler: remove the most recently added screenshot.
    """
    # Nothing to remove if no screenshots have been generated yet
    if html_output:
        html_output.pop()
    out.clear_output(wait=True)
    with out:
        display((HTML("".join(html_output))))


repository = widgets.Dropdown(
    options=[
        ("---", ""),
        ("UK Web Archive", "bl"),
        ("UK Government Web Archive", "ukgwa"),
        ("National Library of Australia", "nla"),
        ("National Library of New Zealand", "nlnz"),
        ("Internet Archive", "ia"),
    ],
    description="Archive:",
    disabled=False,
)

target_url = widgets.Text(description="Target URL:")

target_date = widgets.DatePicker(description="Target date: ", disabled=False)

width = widgets.IntSlider(
    # Start at the slider's minimum (the initial value must be within min/max)
    value=200,
    min=200,
    max=1000,
    step=100,
    description="Width:",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)

out = widgets.Output()
status = widgets.Output()

ss_button = widgets.Button(description="Get screenshot", button_style="primary")
ss_button.on_click(start)
clear_button = widgets.Button(description="Clear all")
clear_button.on_click(clear)
clear_last_button = widgets.Button(description="Clear last")
clear_last_button.on_click(clear_last)

# NOTE(review): this is blank -- presumably meant to hold introductory HTML
# shown above the form; confirm.
note = """ """

display(
    HTML(note),
    widgets.HBox(
        [widgets.VBox([repository, target_date]), widgets.VBox([target_url, width])],
        layout=widgets.Layout(padding="20px"),
    ),
    widgets.HBox([ss_button, clear_button, clear_last_button]),
    status,
    out,
)


# In[ ]:


get_ipython().run_cell_magic('capture', '', '%load_ext dotenv\n%dotenv\n')


# In[ ]:


# Insert some values for automated testing

if os.getenv("GW_STATUS") == "dev":
    repository.value = "nlnz"
    target_url.value = "http://digitalnz.org"
    target_date.value = arrow.get("2015-01-01").date()


# In[ ]:


# If values have been provided via url or above, then start automatically.
# Note that Voila widgets don't load immediately, hence the polling to
# make sure the start button exists.

if target_url.value:
    # NOTE(review): the script is blank, so nothing is actually started
    # automatically -- the polling code described above seems to be missing; confirm.
    script = """ """
    display(HTML(script))


# ----
# Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io). Support me by becoming a [GitHub sponsor](https://github.com/sponsors/wragge)!
#
# Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/).
#
# The Web Archives section of the GLAM Workbench is sponsored by the [British Library](https://www.bl.uk/).