#!/usr/bin/env python
# coding: utf-8

# # Getting all available snapshots of a particular page from the Internet Archive – Timemap or CDX?

# New to Jupyter notebooks? Try *Using Jupyter notebooks* for a quick introduction.

# There are a couple of ways of getting a list of the available snapshots for a particular url. In this notebook, we'll compare the Internet Archive's CDX index API with their Memento Timemap API. Do they give us the same data?
# 
# See [Exploring the Internet Archive's CDX API](exploring_cdx_api.ipynb) for more information about the CDX API.

# In[1]:

import requests
import pandas as pd

# ## Get the data for comparison

# In[2]:

def query_timemap(url):
    '''
    Get a Timemap in JSON format for the specified url.
    '''
    response = requests.get(f'https://web.archive.org/web/timemap/json/{url}', headers={'User-Agent': ''})
    response.raise_for_status()
    return response.json()

# In[3]:

def query_cdx(url, **kwargs):
    '''
    Query the IA CDX API for the supplied url.
    You can optionally provide any of the parameters accepted by the API.
    '''
    params = kwargs
    params['url'] = url
    params['output'] = 'json'
    # A User-Agent value is necessary, or else IA returns an error
    response = requests.get('http://web.archive.org/cdx/search/cdx', params=params, headers={'User-Agent': ''})
    response.raise_for_status()
    return response.json()

# In[4]:

url = 'http://nla.gov.au'

tm_data = query_timemap(url)
tm_df = pd.DataFrame(tm_data[1:], columns=tm_data[0])

cdx_data = query_cdx(url)
cdx_df = pd.DataFrame(cdx_data[1:], columns=cdx_data[0])

# ## Are the columns the same?

# In[5]:

list(cdx_df.columns)

# In[6]:

list(tm_df.columns)

# The Timemap data includes three extra columns: `robotflags`, `offset`, and `filename`. The `offset` and `filename` columns tell you where to find the snapshot, but I'm not sure what `robotflags` is for (it's not in the [specification](http://iipc.github.io/warc-specifications/specifications/cdx-format/cdx-2015/)). Let's have a look at what sort of values it has.

# In[7]:

tm_df['robotflags'].value_counts()

# There's nothing in it – at least for this particular url.
# 
# For my purposes, it doesn't look like the Timemap adds anything useful.

# ## Do they provide the same number of snapshots?

# In[8]:

tm_df.shape

# In[9]:

cdx_df.shape

# So there are more snapshots in the CDX results than in the Timemap. Can we find out what they are?

# In[10]:

# Combine the two dataframes, then keep only rows that aren't duplicated based on timestamp, original, digest, and statuscode
pd.concat([cdx_df, tm_df]).drop_duplicates(subset=['timestamp', 'original', 'digest', 'statuscode'], keep=False)

# Hmm, if there were rows in the `cdx_df` that weren't in the `tm_df`, I'd expect them to show up here, but there are **no** rows that aren't duplicated based on the `timestamp`, `original`, `digest`, and `statuscode` columns...
# 
# Let's try this another way, by finding the number of unique snapshots in each dataframe.

# In[11]:

# Remove duplicate rows
cdx_df.drop_duplicates(subset=['timestamp', 'digest', 'statuscode', 'original'], keep='first').shape

# In[12]:

# Remove duplicate rows
tm_df.drop_duplicates(subset=['timestamp', 'digest', 'statuscode', 'original'], keep='first').shape

# Ah, so both sets of data contain duplicates, and there are really only **2,788 unique snapshots**. Let's look at some of the duplicates in the CDX data.

# In[13]:

dupes = cdx_df.loc[cdx_df.duplicated(subset=['timestamp', 'digest'], keep=False)].sort_values(by='timestamp')
dupes.head(10)

# In[14]:

print(f'Date range of duplicates: {dupes["timestamp"].min()} to {dupes["timestamp"].max()}')

# So it seems they provide the same number of unique snapshots, but the CDX index adds a few more duplicates.
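# We can double-check this by comparing the unique `timestamp`/`digest` pairs from each source directly. This is a minimal sketch that assumes the `tm_df` and `cdx_df` dataframes created above; if the two sources really do cover the same snapshots, both set differences should be empty.

# In[ ]:

tm_snapshots = set(zip(tm_df['timestamp'], tm_df['digest']))
cdx_snapshots = set(zip(cdx_df['timestamp'], cdx_df['digest']))

# If both sources cover the same snapshots, these will both print empty sets
print(tm_snapshots - cdx_snapshots)
print(cdx_snapshots - tm_snapshots)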
# ## Is there a difference in speed?

# In[15]:

get_ipython().run_cell_magic('timeit', '', 'tm_data = query_timemap(url)\n')

# In[16]:

get_ipython().run_cell_magic('timeit', '', 'cdx_data = query_cdx(url)\n')

# ## Conclusion

# Both methods provide much the same data, so it just comes down to convenience and performance.

# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io).
# 
# Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/).