#!/usr/bin/env python
# coding: utf-8

# # Harvest ABC Radio National records from Trove
#
# Trove harvests details of programs and segments broadcast on ABC Radio National. You can find them by [searching](https://trove.nla.gov.au/search/category/music?keyword=nuc%3A%22ABC%3ARN%22) for `nuc:"ABC:RN"` in the Music & Audio category. The records include basic metadata such as titles, dates, and contributors, but not full transcripts or audio.
#
# This notebook harvests metadata describing ABC RN programs and segments using the Trove API. Note that there don't seem to have been any additions to the data since early 2022.
#
# As of December 2023, there are **427,141** records (after removing duplicates) from about **163 programs** (the actual number of programs is lower than this, as the names used for some programs vary). See [this notebook](explore-abcrn-data.ipynb) for some examples of how you can start exploring the data.
#
# The harvested data is available in this GitHub repository. You can download the full dataset as a **340 MB [NDJSON file](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-metadata.ndjson)** (one JSON object per record, separated by line breaks) and as a **216 MB [CSV file](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-metadata.csv)** (with lists saved as pipe-separated strings).
#
# For convenience, I've also created separate CSV files for the programs with the most records:
#
# * [RN Breakfast](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-breakfast-metadata.csv)
# * [RN Drive](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-drive-metadata.csv)
# * [AM](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-am-metadata.csv)
# * [PM](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-pm-metadata.csv)
# * [The World Today](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-worldtoday-metadata.csv)
# * [Late Night Live](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-latenight-metadata.csv)
# * [Life Matters](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-lifematters-metadata.csv)
# * [The Science Show](https://github.com/GLAM-Workbench/trove-abcrn-data/blob/main/abcrn-scienceshow-metadata.csv)
#
# There's also a [harvest from 2016](https://github.com/wragge/radio-national-data) available in a separate repository.
#
# ## Data fields
#
# Any of the fields other than `work_id` and `version_id` might be empty, though in most cases there should at least be values for `title`, `date`, `creator`, `contributor` and `isPartOf`.
#
# * `work_id` – identifier for the containing work in Trove (you can use this to create a url to the item, as in the sketch below)
# * `version_id` – an identifier for the version within the work
# * `title` – title of the program or segment
# * `isPartOf` – name of the program this is a part of
# * `date` – ISO formatted date
# * `creator` – usually just the ABC
# * `contributor` – a list of names of those involved, such as the host, reporter or guest
# * `type` – list of types
# * `format` – list of formats
# * `abstract` – text providing a summary of the program or segment (may include multiple values)
# * `fulltext_url` – link to the page on the ABC website where you can find more information
# * `thumbnail_url` – link to a related thumbnail image on the ABC website
# * `notonline_url` – not sure...
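
# If you just want to use the pre-harvested data rather than running the harvest yourself, the cell below is a minimal sketch of one way to load it. It reads one of the per-program CSV files directly from GitHub, splits the pipe-separated columns back into lists, and builds an item url from `work_id`. The raw.githubusercontent.com address and the `https://trove.nla.gov.au/work/` url pattern are assumptions based on the repository links above and current Trove urls, not something defined elsewhere in this notebook.

# In[ ]:


# A sketch only: it assumes the Science Show CSV linked above is available at this raw url
import pandas as pd

sample_url = (
    "https://raw.githubusercontent.com/GLAM-Workbench/trove-abcrn-data/"
    "main/abcrn-scienceshow-metadata.csv"
)
sample = pd.read_csv(sample_url)

# In the CSV files, multi-value fields are saved as pipe-separated strings,
# so split them back into lists
list_cols = ["title", "isPartOf", "date", "creator", "contributor", "type", "format", "abstract"]
for col in list_cols:
    sample[col] = sample[col].fillna("").str.split("|")

# The work_id can be used to create a url to the item in Trove
# (assumed pattern: https://trove.nla.gov.au/work/<work_id>)
sample["work_url"] = "https://trove.nla.gov.au/work/" + sample["work_id"].astype(str)
sample.head()
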
# ## Import what we need

# In[1]:


import json
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests_cache
from dotenv import load_dotenv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Use a cached session, and retry requests that fail with server errors
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

load_dotenv()


# In[2]:


# Insert your Trove API key
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")


# ## Define some functions

# In[5]:


def get_total(params):
    """Get the total number of results for the supplied search parameters."""
    params["n"] = 0
    response = s.get("https://api.trove.nla.gov.au/v3/result", params=params)
    data = response.json()
    return int(data["category"][0]["records"]["total"])


def get_metadata_source(record):
    """Get the metadata source of a record (it can be either a dict or a string)."""
    try:
        source = record["metadataSource"]["value"]
    except TypeError:
        source = record["metadataSource"]
    return source


def extract_values(value, key="value"):
    """
    Some fields mix dicts and lists.
    Try to extract values from dicts and return only lists.
    """
    values = []
    value_list = [v for v in value if v]
    for v in value_list:
        try:
            values.append(v[key].strip())
        except (TypeError, KeyError):
            values.append(v.strip())
    return values


def get_links(identifiers):
    """
    Flatten the identifiers list of dicts into a dict with linktype as key.
    """
    links = {}
    for link in identifiers:
        try:
            links[f'{link["linktype"]}_url'] = link["value"]
        except (TypeError, KeyError):
            pass
    return links


def harvest(output_file=None, year=None):
    """Harvest ABC RN records from Trove, saving them to an ndjson file."""
    Path("data").mkdir(exist_ok=True)
    if not output_file:
        output_file = f'abcrn-{datetime.now().strftime("%Y%m%d")}.ndjson'
    output_file = Path("data", output_file)
    params = {
        "q": 'nuc:"ABC:RN"',
        "category": "music",
        "include": "workversions",
        "n": 100,
        "bulkHarvest": "true",
        "encoding": "json",
        "key": API_KEY,
    }
    if year:
        params["l-year"] = year
        params["l-decade"] = year[:3]
    start = "*"
    total = get_total(params.copy())
    with output_file.open("w") as data_file:
        with tqdm(total=total) as pbar:
            while start:
                params["s"] = start
                response = s.get(
                    "https://api.trove.nla.gov.au/v3/result", params=params
                )
                data = response.json()
                # Loop through the work records
                records = data["category"][0]["records"]["work"]
                for record in records:
                    # Now loop through the version records
                    for version in record["version"]:
                        # Sometimes versions can themselves contain multiple records and ids.
                        # First we'll split the ids in case there are multiple values.
                        ids = version["id"].split()
                        # Then we'll loop through any sub-version records
                        for i, subr in enumerate(version["record"]):
                            # Get the metadata source so we can filter out any records we don't want
                            subv = subr["metadata"]["dc"]
                            source = get_metadata_source(subr)
                            if source == "ABC:RN":
                                # Add the work and version ids to the record.
                                # Other fields can be missing, so use .get() with an empty default.
                                metadata = {
                                    "work_id": record["id"],
                                    "version_id": ids[i],
                                    "title": extract_values(subv.get("title", [])),
                                    "date": extract_values(subv.get("date", [])),
                                    "isPartOf": extract_values(subv.get("isPartOf", [])),
                                    "creator": extract_values(
                                        subv.get("creator", []), key="name"
                                    ),
                                    "contributor": extract_values(
                                        subv.get("contributor", [])
                                    ),
                                    "abstract": extract_values(subv.get("abstract", [])),
                                    "type": extract_values(subv.get("type", [])),
                                    "format": extract_values(subv.get("format", [])),
                                }
                                # Get links by flattening the identifiers field and add them to the record
                                links = get_links(subv.get("identifier", []))
                                metadata.update(links)
                                # Write the record to the ndjson file, one JSON object per line
                                data_file.write(f"{json.dumps(metadata)}\n")
                try:
                    start = data["category"][0]["records"]["nextStart"]
                except KeyError:
                    start = None
                pbar.update(len(records))


# ## Harvest the data!

# In[6]:


output_file = f'abcrn-{datetime.now().strftime("%Y%m%d")}.ndjson'
harvest(output_file=output_file)


# ## Remove duplicate records
#
# How many records have we harvested? Let's load the `ndjson` file into a dataframe and explore.

# In[7]:


# The lines param tells pandas there's one JSON object per line.
df = pd.read_json(Path("data", output_file), lines=True)
df.head()


# In[8]:


df.shape


# Most of the fields contain lists of values (though there may only be one value in the list). To check for duplicates we need to convert these lists into strings. If there are multiple values, they'll be separated by a pipe (`|`) character.

# In[9]:


cols_with_lists = [
    "title",
    "isPartOf",
    "date",
    "creator",
    "contributor",
    "type",
    "format",
    "abstract",
]

for col in cols_with_lists:
    df[col] = df[col].str.join("|")


# You'd expect the combination of `title`, `date`, and program (in the `isPartOf` field) to be unique – let's see.

# In[10]:


df.loc[df.duplicated(subset=("title", "date", "isPartOf"))].shape


# There seem to be quite a lot of duplicates! Let's remove them based on the `title`, `date`, and `isPartOf` fields. By adding `fulltext_url` to the sort, I'm hoping to drop the duplicates without urls (by default `drop_duplicates` keeps the first version of a duplicated record).

# In[11]:


df = df.sort_values(by=["title", "date", "fulltext_url"]).drop_duplicates(
    subset=["title", "date", "isPartOf"]
)


# Now how many do we have?

# In[12]:


df.shape


# ## Save as CSV

# In[13]:


csv_file = Path("data", f'abcrn-{datetime.now().strftime("%Y%m%d")}.csv')
df.to_csv(csv_file, index=False)


# ## Create CSV downloads for individual programs
#
# Here are the programs with the most records. Note that some, like RN Breakfast, are split between two `isPartOf` values, 'ABC Radio National. RN Breakfast' and 'ABC Radio. RN Breakfast'.

# In[14]:


df["isPartOf"].value_counts()[:20]


# Let's save the programs with the most records as separate CSV files to make them a bit easier to work with. We'll also group together programs with multiple `isPartOf` values.

# In[15]:


programs = {
    "breakfast": ["ABC Radio National. RN Breakfast", "ABC Radio. RN Breakfast"],
    "am": ["ABC Radio. AM", "ABC Radio. AM Archive"],
    "pm": ["ABC Radio. PM", "ABC Radio. PM Archive"],
    "world_today": ["ABC Radio. The World Today", "ABC Radio. The World Today Archive"],
    "drive": ["ABC Radio. RN Drive", "ABC Radio National. RN Drive"],
    "latenight": ["ABC Radio National. Late Night Live"],
    "lifematters": ["ABC Radio National. Life Matters"],
    "scienceshow": ["ABC Radio National. The Science Show"],
}

for program, labels in programs.items():
    dfp = df.loc[df["isPartOf"].isin(labels)].sort_values(by=["date", "title"])
    csv_file = Path("data", f'{program}-{datetime.now().strftime("%Y%m%d")}.csv')
    dfp.to_csv(csv_file, index=False)


# In[16]:


# This is just a small automated test used in development
# You can ignore this cell

if os.getenv("GW_STATUS") == "dev":
    output_file = f'abcrn-test-{datetime.now().strftime("%Y%m%d")}.ndjson'
    output_path = Path("data", output_file)
    harvest(output_file=output_file, year="2022")
    df = pd.read_json(output_path, lines=True)
    assert df.empty is False
    output_path.unlink()


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io/)