#!/usr/bin/env python
# coding: utf-8

# # Create a flat list of organisations contributing metadata to Trove
# 
# The Trove API includes the `contributor` endpoint for retrieving information about organisations whose metadata is aggregated into Trove. If you include the `reclevel=full` parameter, you can get details of all contributors with a single API request like this:
# 
# ```
# https://api.trove.nla.gov.au/v2/contributor?encoding=json&reclevel=full&key=[YOUR API KEY]
# ```
# 
# However, the data can be difficult to use because of its nested structure, with some organisations having several levels of subsidiaries. There's also some inconsistency in the way nested records are named. This notebook aims to work around these problems by converting the nested data into a single flat list of organisations. 
# 
# This code is used to make weekly harvests of the contributor data which are saved [in this repository](https://github.com/wragge/trove-contributor-totals).

# ## Set things up

# In[170]:


import datetime
import json
import os
from pathlib import Path

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Shared HTTP session: requests that come back with a 502, 503 or 504 status
# are retried (up to 5 times, with backoff) before giving up.
s = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504],
)
# Each scheme gets its own adapter, both using the same retry policy
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))


# In[171]:


# Exported form of a `%%capture` notebook cell: it loads the `dotenv` IPython
# extension and runs `%dotenv`, with the cell's output captured rather than shown.
# NOTE(review): `get_ipython` is not defined or imported in this file — presumably
# it is supplied by the IPython/Jupyter runtime; confirm before running as a plain script.
get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n')


# In[172]:


# Trove API key: a non-empty TROVE_API_KEY environment variable takes priority;
# otherwise replace the placeholder string below with your own key.
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"


# ## Define some functions

# In[173]:


def get_contrib_details(record, parent=None):
    """
    Flatten a contributor record, and any nested children, into a list of rows.

    Parameters:
        record: dict describing one contributor. Must contain "id", "name" and
            "totalholdings"; "nuc" and "children" are optional.
        parent: dict for the enclosing contributor (only its "id" and "name"
            are read), or None for a top-level contributor.

    Returns:
        A list of dicts with the keys "id", "name", "total_items", "parent" and
        "nuc". The row for `record` comes first, followed by the rows of its
        descendants in depth-first order.

    The `record` passed in is not modified.
    """
    name = record["name"]
    parent_id = None
    if parent:
        # Prefix the child's name with the parent's (already combined) name,
        # unless the child's name already starts with it
        if not name.startswith(parent["name"]):
            name = f"{parent['name']} {name}"
        parent_id = parent["id"]
    nuc = record.get("nuc")
    details = {
        "id": record["id"],
        "name": name,
        "total_items": int(record["totalholdings"]),
        "parent": parent_id,
        # Keep only the first NUC; a missing or empty value becomes None
        "nuc": nuc[0] if nuc else None,
    }
    records = [details]
    if "children" in record:
        # Pass the combined name forward for deeply nested orgs via a shallow
        # copy, so the caller's data keeps its original name
        records += get_children({**record, "name": name})
    return records


def get_children(parent):
    """
    Flatten every child listed under parent["children"]["contributor"].

    Parameters:
        parent: dict with "id", "name" and a "children" entry whose
            "contributor" value is an iterable of child records.

    Returns:
        A list containing the rows for each child and its descendants, in the
        order the children are listed.
    """
    children = []
    for child in parent["children"]["contributor"]:
        children.extend(get_contrib_details(child, parent))
    return children


def get_contributors(save_json=True, timeout=60):
    """
    Get all Trove contributors, flattening the nested structure and optionally
    saving the original JSON.

    Parameters:
        save_json: if true, write the unmodified API response to
            `trove-contributors-YYYYMMDD.json` (relative to the current
            working directory) before flattening.
        timeout: seconds passed to the HTTP request as its timeout, so a
            stalled connection cannot block forever.

    Returns:
        A flat list of dicts, one per organisation, as produced by
        `get_contrib_details`.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    params = {"encoding": "json", "reclevel": "full", "key": API_KEY}
    response = s.get(
        "https://api.trove.nla.gov.au/v2/contributor", params=params, timeout=timeout
    )
    # Surface HTTP error statuses here rather than failing later while
    # decoding or indexing the response body
    response.raise_for_status()
    data = response.json()
    # Save the original nested JSON response
    if save_json:
        datestamp = datetime.datetime.now().strftime("%Y%m%d")
        Path(f"trove-contributors-{datestamp}.json").write_text(
            json.dumps(data), encoding="utf-8"
        )
    # Flatten each top-level contributor (and its descendants) into one list
    return [
        row
        for contrib in data["response"]["contributor"]
        for row in get_contrib_details(contrib)
    ]


# ## Get the data

# In[174]:


# Harvest the contributor data and flatten it into a list of dicts, one per organisation.
# With the default save_json=True this also writes the raw API response to a dated
# JSON file in the current working directory.
contributors = get_contributors()


# Convert the data to a dataframe.

# In[175]:


# Build a table with one row per organisation from the flattened list of dicts
df = pd.DataFrame(data=contributors)
# Preview the first five rows
df.head(5)


# How many contributors are listed?

# In[178]:


# Number of rows in the dataframe, i.e. the number of organisations listed
len(df)


# How many of the contributor records include NUCs?

# In[179]:


# Keep only the rows whose "nuc" value is not null, then count them
has_nuc = df["nuc"].notna()
len(df[has_nuc])


# Save the data to a CSV file.

# In[177]:


# Write the flattened list to a dated CSV file, with the columns in a fixed
# order and without the dataframe index.
df.to_csv(
    f"trove-contributors-{datetime.datetime.now().strftime('%Y%m%d')}.csv",
    columns=["id", "nuc", "name", "parent", "total_items"],
    index=False,
)


# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge).