#!/usr/bin/env python
# coding: utf-8

# # Get full page screenshots from archived web pages
# 
# [View in GitHub](https://github.com/GLAM-Workbench/web-archives/blob/master/save_screenshot.ipynb) &middot; [View in GLAM Workbench](https://glam-workbench.net/web-archives/#create-and-compare-full-page-screenshots-from-archived-web-pages)

# In[ ]:


# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.


# In[ ]:


get_ipython().run_cell_magic('capture', '', 'import base64\nimport io\nimport math\nimport os\nimport re\nimport time\nfrom pathlib import Path\nfrom urllib.parse import urlparse\n\nimport arrow\nimport geckodriver_autoinstaller\nimport ipywidgets as widgets\nimport PIL\nimport requests\nimport selenium\nfrom IPython.display import HTML, display\nfrom PIL import Image\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom slugify import slugify\n\ngeckodriver_autoinstaller.install()\n')


# In[ ]:


# Base Timegate urls for each supported web archive, keyed by the short
# repository codes used as values in the 'Archive' dropdown.
# query_timegate() appends the target url to these.
TIMEGATES = {
    "nla": "https://web.archive.org.au/awa/",
    "nlnz": "https://ndhadeliver.natlib.govt.nz/webarchive/",
    "bl": "https://www.webarchive.org.uk/wayback/archive/",
    "ia": "https://web.archive.org/web/",
    "ukgwa": "https://webarchive.nationalarchives.gov.uk/ukgwa/",
}

# Domains whose capture urls get the 'if_' modifier added by
# get_full_page_screenshot() before the page is loaded.
wayback = ["web.archive.org"]
# Domains that replay captures inside a frame. Maps the domain (without 'www.')
# to the frame reference that get_full_page_screenshot() passes to
# driver.switch_to.frame() before taking the screenshot.
pywb = {
    "web.archive.org.au": "replayFrame",
    "webarchive.nla.gov.au": "replayFrame",
    "webarchive.org.uk": "replay_iframe",
    "ndhadeliver.natlib.govt.nz": "replayFrame",
    "webarchive.nationalarchives.gov.uk": "replay_iframe",
}

# HTML snippets, one per screenshot, accumulated by display_screenshot() and
# joined together for display so that multiple screenshots can be compared.
html_output = []


def format_date_for_headers(iso_date, tz):
    """
    Build the value of an Accept-Datetime header from a date and timezone.

    The ISO date (YYYY-MM-DD) is interpreted as noon in the timezone 'tz',
    shifted to UTC, and formatted the way Accept-Datetime headers expect:
    eg Fri, 23 Mar 2007 01:00:00 GMT
    """
    noon_local = arrow.get(f"{iso_date} 12:00:00 {tz}", "YYYY-MM-DD HH:mm:ss ZZZ")
    http_date = noon_local.to("utc").format("ddd, DD MMM YYYY HH:mm:ss")
    return http_date + " GMT"


def format_date_from_timestamp(url):
    """
    Extract the capture timestamp from a Memento url and format it for display,
    eg 1 January 2015.

    The timestamp can be 14 digits (YYYYMMDDHHmmss) or 12 digits (YYYYMMDDHHmm),
    optionally followed by a replay modifier (if_, mp_ or id_).

    Raises AttributeError if the url doesn't contain a timestamp.
    """
    timestamp = re.search(r"/(\d{14}|\d{12})(?:if_|mp_|id_)*/", url).group(1)
    # 12 digit timestamps have no seconds, and won't parse against the 14 digit
    # format string, so pad them out with zero seconds first.
    timestamp = timestamp.ljust(14, "0")
    return arrow.get(timestamp, "YYYYMMDDHHmmss").format("D MMMM YYYY")


def parse_links_from_headers(response):
    """
    Map each relation in the response's 'Link' header (eg original, timegate,
    timemap, memento) to the url it points to.
    """
    urls_by_rel = {}
    for rel, link in response.links.items():
        urls_by_rel[rel] = link["url"]
    return urls_by_rel


def query_timegate(timegate, url, date=None, tz="Australia/Canberra", timeout=60):
    """
    Ask an archive's Timegate for captures of a url.

    Parameters:
        timegate: key in TIMEGATES identifying the archive (eg "nla", "ia")
        url: url of the page to look for
        date: target date in YYYY-MM-DD form (optional)
        tz: timezone used to interpret the date
        timeout: seconds to wait for the archive to respond. Without a timeout
            requests can wait forever on an unresponsive server.

    Returns a dict of the urls in the response's 'Link' header, keyed by relation.
    Raises KeyError if 'timegate' isn't in TIMEGATES, and passes on any
    exception raised by requests (including Timeout).
    """
    headers = {}
    # BL, NLNZ & UKGWA don't seem to default to latest date if no date supplied
    if not date and timegate in ["bl", "nlnz", "ukgwa"]:
        date = arrow.utcnow().format("YYYY-MM-DD")
    if date:
        headers["Accept-Datetime"] = format_date_for_headers(date, tz)
    # Note that you don't get a timegate response if you leave off the trailing slash, but extras don't hurt!
    tg_url = f"{TIMEGATES[timegate]}{url}"
    if not url.endswith("/"):
        tg_url += "/"
    # IA doesn't work with head, others don't work with get...
    send_request = requests.get if timegate == "ia" else requests.head
    response = send_request(tg_url, headers=headers, timeout=timeout)
    return parse_links_from_headers(response)


def get_memento(timegate, url, date):
    """
    Find the url of a capture of 'url' in the selected archive.

    Queries the archive's Timegate and returns the first link found from
    'memento', 'prev memento', 'next memento', and 'last memento' (in that order).
    Returns None if the Timegate response has none of these links.
    """
    links = query_timegate(timegate, url, date) or {}
    # NLNZ doesn't always seem to return a Memento, so we'll build in some fuzziness
    for rel in ("memento", "prev memento", "next memento", "last memento"):
        if rel in links:
            return links[rel]
    # Covers both an empty response and one that only has other relations
    # (eg original or timemap).
    return None


def get_full_page_screenshot(url, save_width=200):
    """
    Gets a full page screenshot of the supplied url.
    By default resizes the screenshot to a width of 200px.
    Provide a 'save_width' value to change this.

    The image is saved as a PNG in the 'screenshots' directory.

    Returns the path of the saved file, or None if the url isn't a Memento or
    no screenshot could be captured (an error message is displayed in both cases).
    Exceptions raised by the webdriver while loading the page are passed on to
    the caller, but the browser is always closed first.

    NOTE the webdriver sometimes fails for unknown reasons. Just try again.
    """
    domain = urlparse(url)[1].replace("www.", "")
    # NZ and IA inject content into the page, so we use if_ to get the original page (with rewritten urls)
    if domain in wayback and "if_" not in url:
        url = re.sub(r"/(\d{14}|\d{12})/http", r"/\1if_/http", url)
    match = re.search(r"/(\d{14}|\d{12})(?:if_|mp_)*/https*://?(.+/)", url)
    if match is None:
        # There's something wrong with the link...
        show_error(f"{url} isn't a Memento – did you forget to select an archive?")
        return None
    date_str, site = match.groups()
    output_dir = Path("screenshots")
    output_dir.mkdir(parents=True, exist_ok=True)
    ss_file = Path(output_dir, f"{slugify(site)}-{date_str}-{save_width}.png")
    options = webdriver.FirefoxOptions()
    # The 'headless' property has been removed from recent Selenium releases,
    # passing the argument to Firefox works with old and new versions.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    ss = None
    try:
        driver.implicitly_wait(15)
        driver.get(url)
        # Give some time for everything to load
        time.sleep(30)
        driver.maximize_window()
        # UK and AU use pywb in framed replay mode, so we need to switch to the framed content
        if domain in pywb:
            try:
                driver.switch_to.frame(pywb[domain])
            except selenium.common.exceptions.NoSuchFrameException:
                # If we pass here we'll probably still get a ss, just not full page -- better than failing?
                pass
        # Try each candidate root element until one gives us a screenshot
        for tag in ["body", "html", "frameset"]:
            try:
                elem = driver.find_element(By.TAG_NAME, tag)
                ss = elem.screenshot_as_base64
                break
            except (
                selenium.common.exceptions.NoSuchElementException,
                selenium.common.exceptions.WebDriverException,
            ):
                pass
    finally:
        # Always shut down the browser, even if loading the page failed,
        # otherwise Firefox processes are left running.
        driver.quit()
    if not ss:
        show_error(f"Couldn't get a screenshot of {url} – sorry...")
        return None
    img = Image.open(io.BytesIO(base64.b64decode(ss)))
    # Scale the height by the same ratio as the width to keep the proportions
    ratio = save_width / img.width
    (width, height) = (save_width, math.ceil(img.height * ratio))
    resized_img = img.resize((width, height), PIL.Image.Resampling.LANCZOS)
    resized_img.save(ss_file)
    return ss_file


def display_screenshot(ss_file, url):
    """
    Add a screenshot to the list of results and redisplay them all.

    Parameters:
        ss_file: path of the saved screenshot image
        url: url of the Memento that was captured

    Each result shows the capture date, the archived page's url (linked to the
    Memento), the image, and a download link.
    """
    date = format_date_from_timestamp(url)
    try:
        # Group 1 is the timestamp, group 2 is the url of the archived page,
        # which is what we want to show as the link text.
        display_url = re.search(r"/(\d{14}|\d{12})(?:mp_|if_|id_)*/(.*)$", url).group(2)
    except AttributeError:
        display_url = url
    status.clear_output()
    html_output.append(
        f'<div style="float:left; margin-left: 20px;"><p><b>{date}</b><br><a href="{url.replace("if_/", "/")}">{display_url}</a></p><p><a href="{ss_file}"><img src="{ss_file}"></a><br><a href="{ss_file}">[Download]</a></p></div>'
    )
    with out:
        display((HTML("".join(html_output))))


def show_error(message=None):
    """
    Replace the contents of the status area with an error message.
    """
    status.clear_output()
    text = f"Something went wrong – {message}"
    with status:
        print(text)


def start(e):
    """
    Click handler for the 'Get screenshot' button.

    If an archive is selected, looks up a Memento of the target url (closest to
    the target date, if one is set). Otherwise the target url is treated as the
    url of a Memento. A screenshot of the Memento is then generated and displayed.
    Any problems are reported in the status area.
    """
    status.clear_output()
    out.clear_output(wait=True)
    with status:
        print("Generating screenshot...")
    if repository.value:
        try:
            memento = get_memento(repository.value, target_url.value, target_date.value)
        except requests.exceptions.RequestException:
            # Network problems would otherwise leave the 'Generating screenshot...'
            # message on screen with no explanation.
            show_error("couldn't get a response from the archive – sorry...")
            return
    else:
        memento = target_url.value
    if not memento:
        show_error("couldn't find a Memento – sorry...")
        return
    try:
        ss_file = get_full_page_screenshot(memento, save_width=width.value)
    except selenium.common.exceptions.WebDriverException:
        show_error(f"couldn't get a screenshot of {memento} – sorry...")
        return
    # get_full_page_screenshot has already displayed an error if there's no file
    if ss_file:
        display_screenshot(ss_file, memento)


def clear(e):
    """
    Click handler for the 'Clear all' button.
    Empties the status and results areas and forgets all stored screenshots.
    """
    global html_output
    status.clear_output()
    out.clear_output()
    html_output = []


def clear_last(e):
    """
    Click handler for the 'Clear last' button.
    Removes the most recently added screenshot and redisplays the rest.
    Does nothing if there are no screenshots.
    """
    # Clicking the button before any screenshots exist would otherwise
    # raise an IndexError from pop().
    if not html_output:
        return
    html_output.pop()
    out.clear_output(wait=True)
    with out:
        display((HTML("".join(html_output))))


# Dropdown for choosing the archive. The option values are keys in TIMEGATES.
# The empty '---' value means no archive is selected, in which case start()
# uses the target url directly as the Memento url.
repository = widgets.Dropdown(
    options=[
        ("---", ""),
        ("UK Web Archive", "bl"),
        ("UK Government Web Archive", "ukgwa"),
        ("National Library of Australia", "nla"),
        ("National Library of New Zealand", "nlnz"),
        ("Internet Archive", "ia"),
    ],
    description="Archive:",
    disabled=False,
)

# Text box for the url of the page to look up (or of a specific capture)
target_url = widgets.Text(description="Target URL:")

# Optional date, passed by start() to get_memento() along with the url
target_date = widgets.DatePicker(description="Target date: ", disabled=False)

# Slider for the width that screenshots are resized to, passed by start() as
# the save_width of get_full_page_screenshot().
# The initial value needs to be within min/max, so start at the minimum, which
# matches get_full_page_screenshot's default save_width.
width = widgets.IntSlider(
    value=200,
    min=200,
    max=1000,
    step=100,
    description="Width:",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)

# Output areas: 'out' holds the screenshots, 'status' holds progress and error messages
out = widgets.Output()
status = widgets.Output()
# Buttons and their click handlers.
# The auto-start script at the end of the notebook finds the 'Get screenshot'
# button using the 'mod-primary' class.
ss_button = widgets.Button(description="Get screenshot", button_style="primary")
ss_button.on_click(start)
clear_button = widgets.Button(description="Clear all")
clear_button.on_click(clear)
clear_last_button = widgets.Button(description="Clear last")
clear_last_button.on_click(clear_last)
# Usage instructions displayed above the form
note = """
    <ul class="browser-default">
    <li>Select a repository, and insert a url to generate a screenshot from the archive.</li>
    <li>If you include a date, it\'ll attempt to find the closest capture using Memento Timegates.</li>
    <li>If you don't include a date, it'll give you the most recent capture.</li>
    <li>If you already have the url of the exact capture you want, just put it in the 'Target url' box and leave 'Archive' and 'Target date' blank.
    <li>You can add multiple screenshots to compare changes.</li>
    </ul>
    """
# Lay out the app: instructions, the form (two columns), the buttons,
# then the status and results areas.
display(
    HTML(note),
    widgets.HBox(
        [widgets.VBox([repository, target_date]), widgets.VBox([target_url, width])],
        layout=widgets.Layout(padding="20px"),
    ),
    widgets.HBox([ss_button, clear_button, clear_last_button]),
    status,
    out,
)


# In[ ]:


# Load the dotenv IPython extension and run its %dotenv magic, with the cell's
# output captured. Presumably this reads settings such as GW_STATUS (checked
# in the next cell) from a local .env file -- confirm against the dotenv docs.
get_ipython().run_cell_magic('capture', '', '%load_ext dotenv\n%dotenv\n')


# In[ ]:


# Insert some values for automated testing

# Only applies when the GW_STATUS environment variable is set to "dev".
# Filling in the target url also means the auto-start cell below will
# click the 'Get screenshot' button.
if os.getenv("GW_STATUS") == "dev":
    repository.value = "nlnz"
    target_url.value = "http://digitalnz.org"
    target_date.value = arrow.get("2015-01-01").date()


# In[ ]:


# If values have been provided via url or above, then start automatically.
# Note that Voila widgets don't load immediately, hence the polling to
# make sure the start button exists.
# The script polls for the primary ('Get screenshot') button itself, rather
# than for any button, so it never tries to click a button that isn't there yet.

if target_url.value:
    script = """
    <script type="text/javascript">
        function start() {
          let button = document.querySelector("button.mod-primary");
          if (button) {
            button.click();
          } else {
            setTimeout(start, 5);
          }
        }
    start();
    </script>"""
    display(HTML(script))


# ----
# Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io). Support me by becoming a [GitHub sponsor](https://github.com/sponsors/wragge)!
# 
# Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/).
# 
# The Web Archives section of the GLAM Workbench is sponsored by the [British Library](https://www.bl.uk/).