#!/usr/bin/env python
# coding: utf-8

# # Get a random newspaper article from Trove
#
# Changes to the Trove API mean that the techniques I've previously used to select resources at random [will no longer work](https://updates.timsherratt.org/2019/10/09/creators-and-users.html). This notebook provides one alternative.
#
# I wanted something that would work efficiently, but would also expose as much of the content as possible. Applying multiple facets together with a randomly-generated query seems to do a good job of getting the result set below 100 (the maximum available from a single API call). This should mean that *most* of the newspaper articles are reachable, but it's a bit hard to quantify.
#
# Thanks to Mitchell Harrop for [suggesting I could use randomly selected stopwords](https://twitter.com/mharropesquire/status/1182175315860213760) as queries. I've supplemented the stopwords with letters and digits, and together they seem to do a good job of applying an initial filter and mixing up the relevance ranking.
#
# As you can see from the examples below, you can supply any of the facets available in the newspapers zone – for example: `state`, `title`, `year`, `illType`, `category`.
# In[1]:


import json
import os
import random

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Shared session: requests that come back with a 502, 503 or 504 status are
# retried (up to 5 times, with a backoff between attempts).
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# Pool of terms used as random queries when the caller supplies none
with open("stopwords.json", "r") as json_file:
    STOPWORDS = json.load(json_file)

API_URL = "http://api.trove.nla.gov.au/v2/result"


# In[2]:


get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n')


# In[3]:


# Insert your Trove API key
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")


# In[4]:


def get_random_facet_value(params, facet):
    """
    Get values for the supplied facet and choose one at random.

    Parameters:
        params: dict of API query parameters describing the current search.
            It is copied, not modified.
        facet: name of the facet to request values for (e.g. 'year').

    Returns:
        The 'search' value of one randomly chosen facet term, or None if the
        response contains no usable terms for this facet.
    """
    these_params = params.copy()
    these_params["facet"] = facet
    response = s.get(API_URL, params=these_params)
    data = response.json()
    try:
        terms = data["response"]["zone"][0]["facets"]["facet"]["term"]
        values = [t["search"] for t in terms]
    except (KeyError, IndexError, TypeError):
        # The facet data is missing or not in the expected shape
        return None
    # random.choice() raises IndexError on an empty list, so guard against it
    if not values:
        return None
    return random.choice(values)


def get_total_results(params):
    """
    Return the total number of results matching the supplied query parameters.

    Parameters:
        params: dict of API query parameters.

    Returns:
        The 'total' reported for the first zone in the response, as an int.
    """
    response = s.get(API_URL, params=params)
    data = response.json()
    return int(data["response"]["zone"][0]["records"]["total"])


def get_random_article(query=None, **kwargs):
    """
    Get a random article.

    Parameters:
        query: optional search query. If not supplied, a randomly selected
            stopword is used to mix up the results.
        **kwargs: any of the available facets, such as 'state', 'title',
            'illtype', 'year'. Each is sent to the API as an 'l-<facet>'
            parameter.

    Returns:
        One article record chosen at random from (up to) the first 100
        results, or None if the search produced no results.
    """
    # Facets used to narrow the result set. They are taken from the end of
    # the list, so 'title' is tried first and 'month' last.
    facets = ["month", "year", "decade", "word", "illustrated", "category", "title"]
    max_tries = 10
    params = {
        "zone": "newspaper",
        "encoding": "json",
        # Note that keeping n at 0 until we've filtered the result set speeds things up considerably
        "n": "0",
        # Uncomment these if you need more than the basic data
        # "reclevel": "full",
        # 'include': 'articleText',
        "key": API_KEY,
    }
    if query:
        params["q"] = query
    # If there's no query supplied then use a random stopword to mix up the results
    else:
        params["q"] = f'"{random.choice(STOPWORDS)}"'
    # Apply any supplied facets
    for key, value in kwargs.items():
        params[f"l-{key}"] = value
    # Remove any facets that have already been applied from the list of available facets
    facets = [f for f in facets if f not in kwargs]
    total = get_total_results(params)
    # If our randomly selected stopword has produced no results, keep trying
    # with new stopwords until we get some (give up after 10 tries). The count
    # must be re-fetched after each change, otherwise the retry has no effect.
    # A user-supplied query is never replaced, so retrying it would be pointless.
    tries = 0
    while total == 0 and not query and tries < max_tries:
        params["q"] = f'"{random.choice(STOPWORDS)}"'
        total = get_total_results(params)
        tries += 1
    # Apply facets one at a time until we have no more than 100 results, or we run out of facets
    while total > 100 and facets:
        # Get the next facet
        facet = facets.pop()
        # Set the facet to a randomly selected value
        value = get_random_facet_value(params, facet)
        if value is None:
            # No values available for this facet; the result set is unchanged,
            # so move straight on to the next facet
            continue
        params[f"l-{facet}"] = value
        total = get_total_results(params)
    if total == 0:
        return None
    # We've ended up with some results, so select one (of the first 100) at random
    params["n"] = "100"
    response = s.get(API_URL, params=params)
    data = response.json()
    return random.choice(data["response"]["zone"][0]["records"]["article"])


# ## Get any old article...
# In[5]: get_random_article() # ## Get a random article about pademelons # In[6]: get_random_article(query="pademelon") # ## Get a random article from Tasmania # In[7]: get_random_article(state="Tasmania") # ## Get a random article from the _Sydney Morning Herald_ # In[8]: get_random_article(title="35", category="Article") # ## Get a random illustrated article # In[9]: get_random_article(illustrated="true") # ## Get a random illustrated advertisement from the _Australian Womens Weekly_ # In[10]: get_random_article(title="112", illustrated="true", category="Advertising") # ## Get a random cartoon # In[11]: get_random_article(illtype="Cartoon") # ## Get a random article from 1930 # In[12]: get_random_article(year="1930") # ## Get a random article tagged 'poem' # In[13]: get_random_article(publictag="poem") # ## Speed test # In[14]: get_ipython().run_cell_magic('timeit', '', 'get_random_article()\n') # ---- # # Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/). #