#!/usr/bin/env python
# coding: utf-8

# # Open Access versions of articles in Australian HASS journals
# 
# Previously I [attempted some analysis](finding-oa-versions-of-AHS-articles.ipynb) of the open access status of research articles published in _Australian Historical Studies_. I thought it would be interesting to try some comparisons with other Australian HASS subscription-based journals.
# 
# I've simplified the process here to make it easier to run an analysis of any journal. The steps are:
# 
# 1. Get a list of articles published in the journal by querying the [CrossRef API](https://www.crossref.org/education/retrieve-metadata/rest-api/) with the journal's ISSN.
# 2. Remove recurring sections such as 'Editorial' and 'Book reviews' from the list of articles.
# 3. Look up the OA status of each remaining article by querying the [Unpaywall API](https://unpaywall.org/products/api) with the article's DOI.
# 
# I then do some simple analysis of the OA status, and visualise the results over time.
# 
# ## Understanding the OA status
# 
# The Unpaywall API returns one of five values for the OA status of an article – 'Gold', 'Hybrid', 'Green', 'Bronze', and 'Closed'. There's some [more information on how these are determined](https://support.unpaywall.org/support/solutions/articles/44001777288) on the Unpaywall site. Put simply:
# 
# * **Gold** – the article is freely available, openly licensed, and published in an open access journal
# * **Hybrid** – the article is freely available, openly licensed, and published in a subscription journal
# * **Green** – a version of the article (usually the Author's Accepted Manuscript) is freely available from a public repository
# * **Bronze** – the article is published in a subscription journal, but is freely available from the journal's website 
# * **Closed** – the article is behind a paywall
# 
# ## Caveats
# 
# * The data might not be up-to-date. In particular, I've noticed that some 'bronze' status articles are reported as 'closed'. Presumably this is because the Unpaywall database is running a bit behind changes in the publishers' websites.
# * The definition of an 'article' is not consistent. In earlier issues of some journals it seems that things like book reviews are grouped together under a single DOI, while recent issues have a DOI for each review. 
# 
# ## Journals
# 
# So far I've looked at the following journals (more suggestions welcome):
# 
# * [Australian Historical Studies](#Australian-Historical-Studies)
# * [History Australia](#History-Australia)
# * [Australian Journal of Politics and History](#Australian-Journal-of-Politics-and-History)
# * [Journal of Australian Studies](#Journal-of-Australian-Studies)
# * [Australian Archaeology](#Australian-Archaeology)
# * [Archives and Manuscripts](#Archives-and-Manuscripts)
# * [Journal of the Australian Library and Information Association](#Journal-of-the-Australian-Library-and-Information-Association)
# * [Labour History](#Labour-History)
# 
# Of course, this analysis is focused on subscription journals. There are also open access journals like the [Public History Review](https://epress.lib.uts.edu.au/journals/index.php/phrj) where all the articles would be 'Gold'!
# 
# ## Results (12 January 2021)
# 
# The results are not good. Articles published in Australia's main subscription history journals are about **94% closed**. This is despite the fact that Green OA policies allow authors to deposit versions of their articles in public repositories (often after an embargo period). 
# 
# | Journal | Closed | 
# |----|----|
# |Australian Historical Studies|94.6%|
# |History Australia|94.9%|
# |Australian Journal of Politics and History|95.7%*|
# |Journal of Australian Studies|94.2%|
# |Australian Archaeology|83.4%|
# |Archives and Manuscripts (2012-)|24.8%|
# |Journal of the Australian Library and Information Association|52.5%*|
# |Labour History|93.9%|
# |* Problems with data noted below|
# 
# **This can be fixed!** If you're in a university, talk to your librarians about depositing a Green OA version of your article in an institutional repository. If not, you can use the [Share your paper](https://shareyourpaper.org/) service to upload a Green OA version to Zenodo. Your research will be easier to find, easier to access, easier to use, and available to everyone – not just those with the luxury of an institutional subscription.
# 

# ## Import what we need

# In[34]:


import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests_cache
from tqdm.auto import tqdm
import pandas as pd
import altair as alt
import collections

# Session (from requests_cache) used for every API request in this notebook
s = requests_cache.CachedSession()
# Retry a request up to 5 times, backing off between attempts, when the server answers 502, 503, or 504
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))

# Enable progress_apply() on pandas objects, with the progress bar labelled "records"
tqdm.pandas(desc="records")


# In[35]:


# The APIs are open, but it's polite to let them know who you are.
# This address is included in the User-Agent header sent to CrossRef and in the Unpaywall request.
email = 'tim@discontents.com.au'


# ## Define some functions to do the work

# In[36]:


def get_total_results(issn):
    '''
    Ask CrossRef how many works it lists for the journal with this ISSN.

    Zero rows are requested, as only the count is needed.
    Returns 0 if the response doesn't include a 'total-results' value.
    '''
    url = f'https://api.crossref.org/journals/{issn}/works/'
    payload = s.get(url, params={'rows': 0}).json()
    try:
        return payload['message']['total-results']
    except KeyError:
        return 0

def get_title(record):
    '''
    Return the title of a record as a single value.

    Titles arrive as a list, so any values are joined with ' – '.
    A title that isn't a list (including a missing title, which is None)
    is returned unchanged.
    '''
    value = record.get('title')
    return ' – '.join(value) if isinstance(value, list) else value

def harvest_works(issn):
    '''
    Harvest basic details (DOI, title, date) of articles from the journal with the supplied ISSN from CrossRef.

    Results are requested in pages of 100 using the 'rows' and 'offset' parameters
    until the total reported by get_total_results() has been covered.

    Returns a list of dicts with the keys 'doi', 'title', and 'year'.
    Pages or records that can't be read are printed and skipped rather than
    stopping the harvest.
    '''
    page_size = 100
    works = []
    total_results = get_total_results(issn)
    params = {
        'rows': page_size,
        'offset': 0
    }
    headers = {
        'User-Agent': f'Jupyter Notebook (mailto:{email})'
    }
    # NOTE(review): CrossRef documents a limit on how deep 'offset' paging can go;
    # a very large journal may need cursor-based paging instead – confirm against the API docs.
    with tqdm(total=total_results) as pbar:
        # range() stops before total_results, so no extra (empty) page is requested
        # when the total is an exact multiple of the page size.
        for offset in range(0, total_results, page_size):
            params['offset'] = offset
            response = s.get(f'https://api.crossref.org/journals/{issn}/works/', params=params, headers=headers)
            data = response.json()
            try:
                records = data['message']['items']
            except (TypeError, KeyError):
                # Unexpected response shape – report it and move on to the next page
                print('TYPEERROR')
                print(data)
                continue
            for record in records:
                try:
                    works.append({'doi': record.get('DOI'), 'title': get_title(record), 'year': record['issued']['date-parts'][0][0]})
                except (KeyError, IndexError, TypeError):
                    # Missing or malformed 'issued' date – report the record and skip it
                    print('KEYERROR')
                    print(record)
            # Only count pages that were actually read, using the records already in hand
            pbar.update(len(records))
    return works

def get_oa_status(doi):
    '''
    Get OA status of DOI from the Unpaywall API.

    Returns the value of 'oa_status' in the API response. Returns None if
    there is no DOI to look up, or if the response has no 'oa_status' value,
    so that a single problem record doesn't stop a whole journal being processed.
    '''
    from urllib.parse import quote

    # Harvested records can lack a DOI – nothing to look up in that case
    if not isinstance(doi, str):
        return None
    # Escape the DOI so characters such as '#' or '?' aren't read as URL delimiters;
    # '/' is left alone as it separates the DOI prefix and suffix.
    safe_doi = quote(doi, safe='/')
    response = s.get(f'https://api.unpaywall.org/v2/{safe_doi}', params={'email': email})
    data = response.json()
    return data.get('oa_status')

def create_scale(df):
    '''
    Build the list of colours for the OA status values present in the dataframe.

    Colours are listed in a fixed order (hybrid, green, bronze, closed),
    leaving out any status that doesn't occur in df['oa_status'].
    '''
    palette = (
        ('hybrid', 'gold'),
        ('green', 'green'),
        ('bronze', 'brown'),
        ('closed', 'lightgrey'),
    )
    present = set(df['oa_status'].unique())
    return [colour for status, colour in palette if status in present]

def chart_oa_status(df, title):
    '''
    Display a stacked bar chart of the number of articles per year, coloured by OA status.

    Adds an 'order' column to the supplied dataframe, which is used to sort the OA status values.
    '''
    # Give each status a numeric rank so that bars and legend can be sorted by it
    ranking = {}
    for position, status in enumerate(['closed', 'bronze', 'green', 'hybrid']):
        ranking[status] = position
    df['order'] = df['oa_status'].replace(ranking)

    # Colours for the statuses present in this dataframe
    colour_range = create_scale(df)

    x_axis = alt.X('year:O', title='Year')
    y_axis = alt.Y('count():Q', title='Number of articles', axis=alt.Axis(tickMinStep=1))
    colour = alt.Color(
        'oa_status:N',
        scale=alt.Scale(range=colour_range),
        legend=alt.Legend(title='OA type'),
        sort=alt.EncodingSortField('order', order='descending')
    )
    tooltips = [
        alt.Tooltip('count():Q', title='Number of articles'),
        alt.Tooltip('oa_status', title='OA type')
    ]

    bars = alt.Chart(df).mark_bar().encode(
        x=x_axis,
        y=y_axis,
        color=colour,
        order='order',
        tooltip=tooltips
    )
    display(bars.properties(title=title))


# ## Australian Historical Studies
# 
# * ISSN: 1031-461X
# * [Website](https://www.tandfonline.com/toc/rahs20/current)

# In[37]:


# Harvest the details of every work CrossRef lists under this ISSN
works_ahs = harvest_works('1031-461X')


# In[38]:


# Load the harvested works into a dataframe and check its size
df_ahs = pd.DataFrame(works_ahs)
df_ahs.shape


# In[39]:


# Make sure there are no duplicate rows
df_ahs.drop_duplicates(inplace=True)
df_ahs.shape


# In[40]:


# Show the 25 most frequently repeated titles
df_ahs['title'].value_counts()[:25]


# In[41]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_ahs_unique = df_ahs.copy().drop_duplicates(subset='title', keep=False)
df_ahs_unique.shape


# In[42]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_ahs_unique['oa_status']  = df_ahs_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[43]:


# Number of articles with each OA status
df_ahs_unique['oa_status'].value_counts()


# In[44]:


# The same breakdown as percentages, rounded to one decimal place
df_ahs_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[45]:


# Chart the OA status of articles by year
chart_oa_status(df_ahs_unique, title='Australian Historical Studies')


# ## History Australia
# 
# *  ISSN: 1449-0854
# * [Website](https://www.tandfonline.com/toc/raha20/current)
# * [Archived issues in Trove](https://webarchive.nla.gov.au/tep/46522)

# In[46]:


# Harvest the details of every work CrossRef lists under this ISSN
works_ha = harvest_works('1449-0854')


# In[47]:


# Load the harvested works into a dataframe and check its size
df_ha = pd.DataFrame(works_ha)
df_ha.shape


# In[48]:


# Make sure there are no duplicate rows
df_ha.drop_duplicates(inplace=True)
df_ha.shape


# In[49]:


# List any records that have no title
df_ha.loc[df_ha['title'].isnull()]


# In[50]:


# Remove the records without a title
df_ha.dropna(subset=['title'], inplace=True)
df_ha.shape


# In[51]:


# Show the 30 most frequently repeated titles
df_ha['title'].value_counts()[:30]


# In[52]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_ha_unique = df_ha.copy().drop_duplicates(subset='title', keep=False)
df_ha_unique.shape


# In[53]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_ha_unique['oa_status']  = df_ha_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[54]:


# Number of articles with each OA status
df_ha_unique['oa_status'].value_counts()


# In[55]:


# The same breakdown as percentages, rounded to one decimal place
df_ha_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[56]:


# Chart the OA status of articles by year
chart_oa_status(df_ha_unique, title='History Australia')


# ## Australian Journal of Politics and History
# 
# * ISSN: 1467-8497
# * [Website](https://onlinelibrary.wiley.com/journal/14678497)
# 
# There are clearly some problems with dates in the CrossRef data.

# In[57]:


# Harvest the details of every work CrossRef lists under this ISSN
works_ajph = harvest_works('1467-8497')


# In[58]:


# Load the harvested works into a dataframe and check its size
df_ajph = pd.DataFrame(works_ajph)
df_ajph.shape


# In[59]:


# Make sure there are no duplicate rows
df_ajph.drop_duplicates(inplace=True)
df_ajph.shape


# In[60]:


# List any records that have no title
df_ajph.loc[df_ajph['title'].isnull()]


# In[61]:


# Remove the records without a title
df_ajph.dropna(subset=['title'], inplace=True)
df_ajph.shape


# In[62]:


# Show the 40 most frequently repeated titles
df_ajph['title'].value_counts()[:40]


# In[63]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_ajph_unique = df_ajph.copy().drop_duplicates(subset='title', keep=False)
df_ajph_unique.shape


# In[64]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_ajph_unique['oa_status']  = df_ajph_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[65]:


# Number of articles with each OA status
df_ajph_unique['oa_status'].value_counts()


# In[66]:


# The same breakdown as percentages, rounded to one decimal place
df_ajph_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[67]:


# Chart the OA status of articles by year
chart_oa_status(df_ajph_unique, title='Australian Journal of Politics and History')


# ## Journal of Australian Studies
# 
# * ISSN: 1444-3058
# * [Website](https://www.tandfonline.com/toc/rjau20/current)

# In[68]:


# Harvest the details of every work CrossRef lists under this ISSN
works_jas = harvest_works('1444-3058')


# In[69]:


# Load the harvested works into a dataframe and check its size
df_jas = pd.DataFrame(works_jas)
df_jas.shape


# In[70]:


# Make sure there are no duplicate rows
df_jas.drop_duplicates(inplace=True)
df_jas.shape


# In[71]:


# List any records that have no title
df_jas.loc[df_jas['title'].isnull()]


# In[72]:


# Remove the records without a title
df_jas.dropna(subset=['title'], inplace=True)
df_jas.shape


# In[73]:


# Show the 30 most frequently repeated titles
df_jas['title'].value_counts()[:30]


# In[74]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_jas_unique = df_jas.copy().drop_duplicates(subset='title', keep=False)
df_jas_unique.shape


# In[75]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_jas_unique['oa_status']  = df_jas_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[76]:


# Number of articles with each OA status
df_jas_unique['oa_status'].value_counts()


# In[77]:


# The same breakdown as percentages, rounded to one decimal place
df_jas_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[78]:


# Chart the OA status of articles by year
chart_oa_status(df_jas_unique, title='Journal of Australian Studies')


# ## Australian Archaeology
# 
# * ISSN: 0312-2417
# * [Website](https://www.tandfonline.com/toc/raaa20/current)

# In[79]:


# Harvest the details of every work CrossRef lists under this ISSN
works_aa = harvest_works('0312-2417')


# In[80]:


# Load the harvested works into a dataframe and check its size
df_aa = pd.DataFrame(works_aa)
df_aa.shape


# In[81]:


# Make sure there are no duplicate rows
df_aa.drop_duplicates(inplace=True)
df_aa.shape


# In[82]:


# List any records that have no title
df_aa.loc[df_aa['title'].isnull()]


# In[83]:


# Remove the records without a title
df_aa.dropna(subset=['title'], inplace=True)
df_aa.shape


# In[84]:


# Show the 30 most frequently repeated titles
df_aa['title'].value_counts()[:30]


# In[85]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_aa_unique = df_aa.copy().drop_duplicates(subset='title', keep=False)
df_aa_unique.shape


# In[86]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_aa_unique['oa_status']  = df_aa_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[87]:


# Number of articles with each OA status
df_aa_unique['oa_status'].value_counts()


# In[88]:


# The same breakdown as percentages, rounded to one decimal place
df_aa_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[89]:


# Chart the OA status of articles by year
chart_oa_status(df_aa_unique, title='Australian Archaeology')


# ## Archives and Manuscripts
# 
# * ISSN: 0157-6895
# * [Website](https://www.tandfonline.com/toc/raam20/current)
# 
# Note that articles published before 2012 are available through an [open access repository](https://publications.archivists.org.au/index.php/asa).

# In[90]:


# Harvest the details of every work CrossRef lists under this ISSN
works_am = harvest_works('0157-6895')


# In[91]:


# Load the harvested works into a dataframe and check its size
df_am = pd.DataFrame(works_am)
df_am.shape


# In[92]:


# Make sure there are no duplicate rows
df_am.drop_duplicates(inplace=True)
df_am.shape


# In[93]:


# List any records that have no title
df_am.loc[df_am['title'].isnull()]


# In[94]:


# Remove the records without a title
df_am.dropna(subset=['title'], inplace=True)
df_am.shape


# In[95]:


# Show the 30 most frequently repeated titles
df_am['title'].value_counts()[:30]


# In[96]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_am_unique = df_am.copy().drop_duplicates(subset='title', keep=False)
df_am_unique.shape


# In[97]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_am_unique['oa_status']  = df_am_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[98]:


# Number of articles with each OA status
df_am_unique['oa_status'].value_counts()


# In[99]:


# The same breakdown as percentages, rounded to one decimal place
df_am_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[100]:


# Chart the OA status of articles by year
chart_oa_status(df_am_unique, title='Archives and Manuscripts')


# ## Journal of the Australian Library and Information Association
# 
# * ISSN: 2475-0158
# * [Website](https://www.tandfonline.com/toc/ualj21/current)
# 
# Previously _Australian Academic and Research Libraries_ 
# 
# * ISSN: 0004-8623
# * [Website](https://www.tandfonline.com/toc/uarl20/current)
# * [Archived issues available in Trove](https://webarchive.nla.gov.au/awa/20130209041025/http://pandora.nla.gov.au/pan/128690/20130208-0850/www.alia.org.au/publishing/aarl/index.html)
# 
# Note that most of AARL seems to be 'bronze', but is not being accurately reported by the Unpaywall API.

# In[101]:


# Harvest the works listed under the ISSN of the earlier title, Australian Academic and Research Libraries
works_aarn = harvest_works('0004-8623')


# In[102]:


# Harvest the works listed under the ISSN of the current title
works_jalia = harvest_works('2475-0158')


# In[103]:


# Combine the works from both titles into a single dataframe and check its size
# NOTE(review): each frame keeps its own row labels here, so index values repeat in the combined frame
df_jalia = pd.concat([pd.DataFrame(works_aarn), pd.DataFrame(works_jalia)]) 
df_jalia.shape


# In[104]:


# Make sure there are no duplicate rows
df_jalia.drop_duplicates(inplace=True)
df_jalia.shape


# In[105]:


# List any records that have no title
df_jalia.loc[df_jalia['title'].isnull()]


# In[106]:


# Remove the records without a title
df_jalia.dropna(subset=['title'], inplace=True)
df_jalia.shape


# In[107]:


# Show the 40 most frequently repeated titles
df_jalia['title'].value_counts()[:40]


# In[108]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_jalia_unique = df_jalia.copy().drop_duplicates(subset='title', keep=False)
df_jalia_unique.shape


# In[109]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_jalia_unique['oa_status']  = df_jalia_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[110]:


# Number of articles with each OA status
df_jalia_unique['oa_status'].value_counts()


# In[111]:


# The same breakdown as percentages, rounded to one decimal place
df_jalia_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[112]:


# Chart the OA status of articles by year
chart_oa_status(df_jalia_unique, title='Journal of the Australian Library and Information Association')


# ## Labour History
# 
# * ISSN: 0023-6942
# * [Website](https://www.liverpooluniversitypress.co.uk/journals/id/55)

# In[113]:


# Harvest the details of every work CrossRef lists under this ISSN
works_lh = harvest_works('0023-6942')


# In[114]:


# Load the harvested works into a dataframe and check its size
df_lh = pd.DataFrame(works_lh)
df_lh.shape


# In[115]:


# Make sure there are no duplicate rows
df_lh.drop_duplicates(inplace=True)
df_lh.shape


# In[116]:


# List any records that have no title
df_lh.loc[df_lh['title'].isnull()]


# In[117]:


# Remove the records without a title
df_lh.dropna(subset=['title'], inplace=True)
df_lh.shape


# In[118]:


# Show the 30 most frequently repeated titles
df_lh['title'].value_counts()[:30]


# In[119]:


# Get rid of titles that appear more than once (keep=False drops every copy, not just the extras).
# This is how recurring sections such as 'Editorial' and 'Book reviews' are removed.
df_lh_unique = df_lh.copy().drop_duplicates(subset='title', keep=False)
df_lh_unique.shape


# In[120]:


# Look up the OA status of each remaining DOI, showing a progress bar
df_lh_unique['oa_status']  = df_lh_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[121]:


# Number of articles with each OA status
df_lh_unique['oa_status'].value_counts()


# In[122]:


# The same breakdown as percentages, rounded to one decimal place
df_lh_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[123]:


# Chart the OA status of articles by year
chart_oa_status(df_lh_unique, title='Labour History')


# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org)  
# This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.