#!/usr/bin/env python
# coding: utf-8

# # Get a random newspaper article from Trove
# 
# Changes to the Trove API mean that the techniques I've previously used to select resources at random [will no longer work](https://updates.timsherratt.org/2019/10/09/creators-and-users.html). This notebook provides one alternative.
# 
# I wanted something that would work efficiently, but would also expose as much of the content as possible. Applying multiple facets together with a randomly-generated query seems to do a good job of getting the result set below 100 (the maximum available from a single API call). This should mean that *most* of the newspaper articles are reachable, but it's a bit hard to quantify.
# 
# Thanks to Mitchell Harrop for [suggesting I could use randomly selected stopwords](https://twitter.com/mharropesquire/status/1182175315860213760) as queries. I've supplemented the stopwords with letters and digits, and together they seem to do a good job of applying an initial filter and mixing up the relevance ranking.
# 
# As you can see from the examples below, you can supply any of the facets available in the newspapers zone – for example: `state`, `title`, `year`, `illType`, `category`.

# In[1]:


import json
import os
import random

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Endpoint that every request made through the session below is sent to.
API_URL = "http://api.trove.nla.gov.au/v2/result"

# One shared session; the same retry policy (502/503/504 responses, total of 5,
# backoff factor of 1) is attached to both URL schemes.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s = requests.Session()
for scheme in ("https://", "http://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))

# Terms used as random queries, read from stopwords.json in the working directory.
with open("stopwords.json", "r") as json_file:
    STOPWORDS = json.load(json_file)


# In[2]:


# Exported Jupyter cell magic (needs an IPython session to run): loads the dotenv
# extension and reads variables from a .env file if one exists, with %%capture
# suppressing any messages the cell would otherwise print.
get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n')


# In[3]:


# A non-empty TROVE_API_KEY environment variable takes precedence; otherwise the
# placeholder is used -- replace it with your own Trove API key.
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"


# In[4]:


def get_random_facet_value(params, facet):
    """
    Get values for the supplied facet and choose one at random.

    Parameters:
        params (dict): query parameters for the search. A copy is made, so the
            caller's dict is not modified.
        facet (str): name of the facet whose values should be requested.

    Returns:
        The 'search' value of one randomly chosen facet term, or None if the
        response contains no usable terms for the facet.
    """
    these_params = params.copy()
    these_params["facet"] = facet
    response = s.get(API_URL, params=these_params)
    data = response.json()
    try:
        terms = data["response"]["zone"][0]["facets"]["facet"]["term"]
        values = [t["search"] for t in terms]
    except (KeyError, IndexError, TypeError):
        # The facet section is missing or not shaped as expected, so there is
        # nothing to choose from.
        return None
    # random.choice() raises IndexError on an empty sequence, so guard against
    # a facet that came back with no terms.
    if not values:
        return None
    return random.choice(values)


def get_total_results(params):
    """
    Send the search described by params and return, as an int, the record
    total reported for the first zone in the response.
    """
    payload = s.get(API_URL, params=params).json()
    first_zone = payload["response"]["zone"][0]
    return int(first_zone["records"]["total"])


def get_random_article(query=None, **kwargs):
    """
    Get a random article.

    If no query is supplied, a randomly chosen stopword (as a quoted phrase) is
    used as the query to mix up the results. Further facets are then applied,
    each set to a randomly chosen value, until the result set has 100 or fewer
    records or there are no facets left to apply. One article is then picked
    at random from the first 100 results.

    Parameters:
        query (str, optional): search query. Defaults to a random stopword.
        **kwargs: facets to apply, such as 'state', 'title', 'illtype', 'year'.
            Each is sent as an 'l-<name>' parameter with the supplied value.

    Returns:
        One article record from the API response, or None if the search
        produced no results.
    """
    # Maximum number of replacement stopwords to try when a random query finds nothing
    max_retries = 10
    facets = ["month", "year", "decade", "word", "illustrated", "category", "title"]
    params = {
        "zone": "newspaper",
        "encoding": "json",
        # Note that keeping n at 0 until we've filtered the result set speeds things up considerably
        "n": "0",
        # Uncomment these if you need more than the basic data
        # "reclevel": "full",
        # 'include': 'articleText',
        "key": API_KEY,
    }
    if query:
        params["q"] = query
    # If there's no query supplied then use a random stopword to mix up the results
    else:
        params["q"] = f'"{random.choice(STOPWORDS)}"'
    # Apply any supplied facets
    for key, value in kwargs.items():
        params[f"l-{key}"] = value
    # Remove any facets that have already been applied from the list of available facets
    facets = [f for f in facets if f not in kwargs]
    total = get_total_results(params)
    # If our randomly selected stopword has produced no results, keep trying
    # with new stopwords until we get some (give up after max_retries tries).
    # A caller-supplied query is never replaced, so there is nothing to retry.
    if not query:
        tries = 0
        while total == 0 and tries < max_retries:
            params["q"] = f'"{random.choice(STOPWORDS)}"'
            # Re-run the search so the new query actually affects the total
            total = get_total_results(params)
            tries += 1
    # Apply facets one at a time until we have 100 or fewer results, or we run out of facets
    while total > 100 and facets:
        # Get the next facet
        facet = facets.pop()
        # Set the facet to a randomly selected value
        value = get_random_facet_value(params, facet)
        if value is None:
            # No value available for this facet; the total can't have changed,
            # so move on to the next facet without another request
            continue
        params[f"l-{facet}"] = value
        total = get_total_results(params)
    # If we've ended up with some results, then select one (of the first 100) at random
    if total > 0:
        params["n"] = "100"
        response = s.get(API_URL, params=params)
        data = response.json()
        return random.choice(data["response"]["zone"][0]["records"]["article"])
    return None


# ## Get any old article...

# In[5]:


# No query and no facets: a random stopword is used as the query
get_random_article()


# ## Get a random article about pademelons

# In[6]:


# A supplied query replaces the random stopword
get_random_article(query="pademelon")


# ## Get a random article from Tasmania

# In[7]:


# Keyword arguments are sent as facet limits, here 'l-state'
get_random_article(state="Tasmania")


# ## Get a random article from the _Sydney Morning Herald_

# In[8]:


# Going by the heading above, "35" is the title value for the Sydney Morning Herald
get_random_article(title="35", category="Article")


# ## Get a random illustrated article

# In[9]:


# The facet value is passed as the string "true"
get_random_article(illustrated="true")


# ## Get a random illustrated advertisement from the _Australian Women's Weekly_

# In[10]:


# Several facets can be combined; going by the heading above, "112" is the
# title value for the Australian Women's Weekly
get_random_article(title="112", illustrated="true", category="Advertising")


# ## Get a random cartoon

# In[11]:


# Limit by illustration type
get_random_article(illtype="Cartoon")


# ## Get a random article from 1930

# In[12]:


# The year is passed as a string
get_random_article(year="1930")


# ## Get a random article tagged 'poem'

# In[13]:


# Limit to articles carrying the public tag 'poem'
get_random_article(publictag="poem")


# ## Speed test

# In[14]:


# Exported %%timeit cell magic (needs an IPython session to run); each timed
# call to get_random_article() makes several API requests
get_ipython().run_cell_magic('timeit', '', 'get_random_article()\n')


# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).
#