#!/usr/bin/env python
# coding: utf-8

# # Visualising open collections in DigitalNZ
# 
# DigitalNZ's `usage` facet lets you know what you can do with an item. The possible values are:
# 
# * Share
# * Modify
# * Use commercially
# * All rights reserved
# * Unknown
# 
# I've already harvested the [totals for each facet value](facets/usage.csv). You can find out more about [harvesting facet data in this notebook](harvest_facet_data.ipynb). Let's look at the totals.

# In[1]:


import pandas as pd
from pathlib import Path

# Load the previously harvested totals for each `usage` facet value
usage_csv = Path('facets') / 'usage.csv'
df = pd.read_csv(usage_csv)
df


# In my head, I need to try and translate these back into copyright and CC licence equivalents. So 'Modify' would be CC-BY-NC, and 'Share' would be CC-BY-NC-ND. According to [open licence definitions](https://opendefinition.org/licenses/), NC and ND licences are not regarded as 'open'. So to find 'open' items in DigitalNZ we have to look for items that have a `usage` value of 'Use commercially'. Of course, many of these will have no known copyright restrictions rather than a CC licence.
# 
# The 'Share', 'Modify', and 'Use commercially' values are not mutually exclusive. An item you can 'Use commercially', you can also 'Share'. This means items can have multiple values for `usage`. To find the total number of items with `usage` values we can try adding 'Share', 'All rights reserved', and 'Unknown' as there shouldn't be any overlap between these.

# In[2]:


# Add up the counts for the three usage values that shouldn't overlap
non_overlapping = df['value'].isin(['Share', 'All rights reserved', 'Unknown'])
df.loc[non_overlapping, 'count'].sum()


# So the percentage of open items in DigitalNZ is...

# In[3]:


# NOTE(review): both figures are hard-coded -- presumably the 'Use commercially' count and the
# total from the previous cell; they will go stale if the facet data is re-harvested
open_share = 22641258 / 31553183
print(f'{open_share:.2%}')


# That's pretty good, though I suspect that this is dominated by out-of-copyright digitised newspaper articles from Papers Past.
# 
# In this notebook I'm going to attempt a more fine-grained view by identifying and visualising individual collections within DigitalNZ that have open content.

# ## Import what we need

# In[4]:


import requests
import requests_cache
from tqdm.auto import tqdm
import altair as alt

# Shared session used by get_records() for every API request.
# NOTE(review): CachedSession is created with its defaults -- presumably this caches
# responses so re-running the notebook doesn't re-download them; confirm backend/expiry
s = requests_cache.CachedSession()


# In[5]:


import os

# The API key can be supplied through the DIGITALNZ_API_KEY environment variable,
# which keeps personal keys out of the notebook.
# NOTE(review): the fallback below is the key that was already committed here; it is
# kept only so existing runs behave the same. It should be rotated and replaced with
# the '[YOUR API KEY]' placeholder.
API_KEY = os.environ.get('DIGITALNZ_API_KEY', '9yXNTynMDb3TUQws7QuD')

# Endpoint used for all record searches
# NOTE(review): the key is sent over plain http -- confirm the API accepts https and switch
API_URL = 'http://api.digitalnz.org/v3/records.json'


# ## Define some functions

# In[6]:


def get_records(params):
    '''
    Get records from a search using the supplied parameters.

    Sends a GET request to API_URL through the shared session `s`,
    passing `params` as the query string, and returns the decoded JSON body.

    Raises an HTTPError if the API responds with a 4xx/5xx status.
    '''
    response = s.get(API_URL, params=params)
    # Fail here with a clear HTTP error (eg bad API key) rather than later
    # when the caller tries to read keys from an error payload
    response.raise_for_status()
    return response.json()

def harvest_facet_values(facet, **kwargs):
    '''
    Get all the values for the given facet.
    Apply filters in kwargs.

    Parameters:
        facet: name of the facet to harvest (eg 'usage').
        **kwargs: search filters. A `text` keyword is sent as the `text`
            parameter; every other keyword is sent as an `and[<key>][]` filter.

    Returns:
        A dict merging the facet data from every page of results
        (presumably facet value -> count; confirm against the API response).
    '''
    facets = {}
    params = {
        'api_key': API_KEY,
        # No records are requested, only the facet data
        'per_page': 0,
        'facets': facet,
        'facets_per_page': 350,
    }
    for k, v in kwargs.items():
        if k == 'text':
            params[k] = v
        else:
            params[f'and[{k}][]'] = v
    page = 1
    while True:
        params['facets_page'] = page
        data = get_records(params)
        page_facets = data['search']['facets'][facet]
        # An empty page means there are no more facet values to collect
        if not page_facets:
            break
        facets.update(page_facets)
        page += 1
    # Return only after the loop has finished, so that facets spanning
    # more than one page are harvested in full
    return facets


# ## Assemble the data
# 
# In another notebook, I've already harvested `primary_collection` facets for each `content_partner`. Here we'll add in the `usage` data for each collection.

# In[7]:


# Load the previously harvested list of collections for each content partner
collections_csv = Path('facets') / 'collections_by_partner.csv'
df_collections = pd.read_csv(collections_csv)


# In[8]:


dfs_usage = []

# Work through each partner/collection pair in turn
for row in df_collections.itertuples():
    partner, collection = row.content_partner, row.primary_collection

    # Ask the API for the usage facet values of this collection
    facets = harvest_facet_values('usage', content_partner=partner, primary_collection=collection)

    # Nothing to record if no usage values came back
    if not facets:
        continue

    # Turn the facets dict into a dataframe with one row per usage value
    df = pd.DataFrame.from_dict(facets, orient='index').reset_index()
    df.columns = ['usage', 'count']

    # 'Share', 'All rights reserved', & 'Unknown' shouldn't overlap, so adding them should give a total number of items
    # Items with usage values of 'Modify' & 'Use commercially' should already be counted under 'Share'
    in_total = df['usage'].isin(['Share', 'All rights reserved', 'Unknown'])

    df = df.assign(
        usage_total=df.loc[in_total, 'count'].sum(),
        # Partner and collection names
        content_partner=partner,
        primary_collection=collection,
        # The number of items from the primary_collection facet
        # (`row.count` is the `count` column of the row, not the tuple method)
        items_total=row.count,
    )

    # Keep this df for merging once the loop is done
    dfs_usage.append(df)

# Combine the per-collection dfs into one big df
df_usage = pd.concat(dfs_usage)


# Now we have a dataframe combining all the partner, collection and usage data.

# In[9]:


# Put the identifying columns first, followed by the counts
column_order = ['content_partner', 'primary_collection', 'items_total', 'usage', 'count', 'usage_total']
df_usage = df_usage[column_order]

# Group the rows for each partner and collection together
df_usage = df_usage.sort_values(by=['content_partner', 'primary_collection'])
df_usage.head()


# Before we move on, I'm going to save this dataframe [as a CSV](facets/usage_by_collection_and_partner.csv) because it might be interesting to explore further.

# In[10]:


# Write the combined usage data to the facets directory, leaving out the dataframe index
usage_by_collection_csv = Path('facets') / 'usage_by_collection_and_partner.csv'
df_usage.to_csv(usage_by_collection_csv, index=False)


# You might notice that I have two 'total' fields in the dataframe. One is the number of items returned by the `primary_collection` facet. The other is calculated by adding up the number of items with `usage` values of 'Share', 'All rights reserved', and 'Unknown'. All items with a usage value of 'Modify' or 'Use commercially' should also have a value of 'Share', so I don't include them in the total. Assuming that all items have a `usage` value, these two totals should be the same. But are they?
# 
# The table below shows collections where the totals don't match. The good news is that most of them do, so my working assumptions seem pretty safe. But it seems that there are some items that don't have a `usage` value. In most cases the differences are small, but there are major disparities in the case of the Kura Heritage Collections Online and the Turnbull Library's *Transactions & Proceedings of the Royal Society of NZ*. Below I'm going to use the total in order to calculate the proportion of items that are 'open'. So which value should I use? I'm going to go with the `usage_total` for now, because I know what that represents, but it would be easy to swap.

# In[11]:


# Rows where the facet total and the calculated usage total disagree
totals_differ = df_usage['items_total'] != df_usage['usage_total']

# Each collection has one row per usage value, so show each mismatching collection only once
df_usage.loc[totals_differ].drop_duplicates(
    subset=['content_partner', 'primary_collection', 'items_total', 'usage_total']
)


# Now we'll calculate what proportion of each collection is open, by dividing the number with a `usage` value of 'Use commercially', by the total number of items with `usage` values. 

# In[12]:


# Only use rows where usage value is 'Use commercially'
df_open = df_usage.loc[df_usage['usage'] == 'Use commercially'].copy()

# Calculate the proportion of items that are open by dividing those you can use commercially by the total
# Done column-wise rather than row by row, so it also works when no rows were selected
# (a zero usage_total gives inf/NaN here instead of raising an error)
df_open['open'] = df_open['count'] / df_open['usage_total']

# Some collection names are the same, so we'll combine the partner & collection name to create a unique label for Altair
# astype(str) keeps the label text the same as string formatting would give for non-string values
df_open['label'] = (
    df_open['primary_collection'].astype(str)
    + ' ('
    + df_open['content_partner'].astype(str)
    + ')'
)


# In[13]:


# Preview the first few rows to check the new `open` and `label` columns
df_open.head()


# We're ready to make a chart!

# ## Visualise the results!
# 
# For something a bit different, and to celebrate the fact that GLAM organisations are making their collections openly available, I thought I'd attempt a fireworks theme for this visualisation. It's a bit of an experiment, but also a demonstration of how easy it is to play around with styles in Altair.

# In[14]:


# Switch Altair to its dark theme so the 'fireworks' stand out
alt.themes.enable('dark')

# Grey used for axes, title and legend text
grey = '#999999'

# Details shown when hovering over a collection
tooltips = [
    alt.Tooltip('content_partner', title='Partner'),
    alt.Tooltip('primary_collection', title='Collection'),
    alt.Tooltip('usage_total', title='Total items', format=','),
    alt.Tooltip('open', format='.2%', title='Percent open'),
]

# Encodings shared by both layers
base = alt.Chart(df_open).encode(
    # X axis: the combined collection/partner label
    x=alt.X('label:N', title='Collection'),
    # Y axis: the proportion that is open, formatted as a percentage
    y=alt.Y('open:Q', axis=alt.Axis(format='%', grid=False), title='Percent open'),
    # One colour per content partner, without a legend
    color=alt.Color('content_partner:N', legend=None),
    tooltip=tooltips,
)

# The firework bursts: the dashed white stroke makes the lines radiating out from each circle
# Size uses a log scale because Papers Past is so much bigger than anything else
burst_size = alt.Size('usage_total:Q', scale=alt.Scale(type='log', range=[1, 6000]), title='Total items')
circles = base.mark_circle(
    opacity=0.8,
    strokeOpacity=0.2,
    strokeDash=[3, 1],
    strokeWidth=20,
    stroke='white',
).encode(size=burst_size)

# Very thin bars act as the trails connecting the explosions to the baseline
lines = base.mark_bar(
    size=2,
    opacity=0.3,
    strokeDash=[2, 2],
    strokeWidth=1,
    stroke='white',
    strokeOpacity=0.2,
).encode()

# Layer the trails under the bursts, then apply the style options one step at a time
combined = alt.layer(lines, circles).properties(
    height=500,
    width=2500,
    title='Collections in DigitalNZ with open content (commercial use allowed)',
)
combined = combined.configure_axis(
    grid=False,
    labelColor=grey,
    domainColor=grey,
    tickColor=grey,
    titleColor=grey,
)
combined = combined.configure_view(strokeWidth=0)
combined = combined.configure_title(color=grey)
combined = combined.configure_legend(
    symbolBaseFillColor='#666666',
    symbolStrokeWidth=0,
    titleColor=grey,
    labelColor=grey,
)

combined


# Save the visualisation as an HTML page. You can [see the result here](https://glam-workbench.net/digitalnz-views/open_collections_digitalnz.html).

# In[15]:


# Write the chart to an HTML file in the current working directory
combined.save('open_collections_digitalnz.html')


# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).