#!/usr/bin/env python # coding: utf-8 # # Open Access versions of articles in Australian HASS journals # # Previously I [attempted some analysis](finding-oa-versions-of-AHS-articles.ipynb) of the open access status of research articles published in _Australian Historical Studies_. I thought it would be interesting to try some comparisons with other Australian HASS subscription-based journals. # # I've simplified the process here to make it easier to run an analysis of any journal. The steps are: # # 1. Get a list of articles published in the journal by querying the [CrossRef API](https://www.crossref.org/education/retrieve-metadata/rest-api/) with the journal's ISSN. # 2. Remove recurring sections such as 'Editorial' and 'Book reviews' from the list of articles. # 3. Look up the OA status of each remaining article by querying the [Unpaywall API](https://unpaywall.org/products/api) with the article's DOI. # # I then do some simple analysis of the OA status, and visualise the results over time. # # ## Understanding the OA status # # The Unpaywall API returns one of five values for the OA status of an article – 'Gold', 'Hybrid', 'Green', 'Bronze', and 'Closed'. There's some [more information on how these are determined](https://support.unpaywall.org/support/solutions/articles/44001777288) on the Unpaywall site. Put simply: # # * **Gold** – the article is freely available, openly licensed, and published in an open access journal # * **Hybrid** – the article is freely available, openly licensed, and published in a subscription journal # * **Green** – a version of the article (usually the Author's Accepted Manuscript) is freely available from a public repository # * **Bronze** – the article is published in a subscription journal, but is freely available from the journal's website # * **Closed** – the article is behind a paywall # # ## Caveats # # * The data might not be up-to-date. In particular, I've noticed that some 'bronze' status articles are reported as 'closed'. 
Presumably this is because the Unpaywall database is running a bit behind changes in the publishers' websites. # * The definition of an 'article' is not consistent. In earlier issues of some journals it seems that things like book reviews are grouped together under a single DOI, while recent issues have a DOI for each review. # # ## Journals # # So far I've looked at the following journals (more suggestions welcome): # # * [Australian Historical Studies](#Australian-Historical-Studies) # * [History Australia](#History-Australia) # * [Australian Journal of Politics and History](#Australian-Journal-of-Politics-and-History) # * [Journal of Australian Studies](#Journal-of-Australian-Studies) # * [Australian Archaeology](#Australian-Archaeology) # * [Archives and Manuscripts](#Archives-and-Manuscripts) # * [Journal of the Australian Library and Information Association](#Journal-of-the-Australian-Library-and-Information-Association) # * [Labour History](#Labour-History) # # Of course, this analysis is focused on subscription journals. There are also open access journals like the [Public History Review](https://epress.lib.uts.edu.au/journals/index.php/phrj) where all the articles would be 'Gold'! # # ## Results (12 January 2021) # # The results are not good. Articles published in Australia's main subscription history journals are about **94% closed**. This is despite the fact that Green OA policies allow authors to deposit versions of their articles in public repositories (often after an embargo period). 
#
# | Journal | Closed |
# |----|----|
# |Australian Historical Studies|94.6%|
# |History Australia|94.9%|
# |Australian Journal of Politics and History|95.7%*|
# |Journal of Australian Studies|94.2%|
# |Australian Archaeology|83.4%|
# |Archives and Manuscripts (2012-)|24.8%|
# |Journal of the Australian Library and Information Association|52.5%*|
# |Labour History|93.9%|
# |* Problems with data noted below|
#
# **This can be fixed!** If you're in a university, talk to your librarians about depositing a Green OA version of your article in an institutional repository. If not, you can use the [Share your paper](https://shareyourpaper.org/) service to upload a Green OA version to Zenodo. Your research will be easier to find, easier to access, easier to use, and available to everyone – not just those with the luxury of an institutional subscription.
#
# ## Import what we need

# In[34]:


import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests_cache
from tqdm.auto import tqdm
import pandas as pd
import altair as alt
import collections

# Cached session so re-running the notebook doesn't re-request everything;
# transient gateway errors are retried with a backoff.
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
tqdm.pandas(desc="records")


# In[35]:


# the APIs are open, but it's polite to let the APIs know who you are
email = 'tim@discontents.com.au'


# ## Define some functions to do the work

# In[36]:


# Number of records requested from CrossRef per page.
CROSSREF_ROWS = 100

# Colour used for each Unpaywall OA status, ordered from most open to closed.
# The order matters: chart_oa_status() sorts the colour domain by descending
# 'order', so the colours returned by create_scale() must follow this sequence.
OA_STATUS_COLOURS = collections.OrderedDict([
    ('gold', 'darkorange'),
    ('hybrid', 'gold'),
    ('green', 'green'),
    ('bronze', 'brown'),
    ('closed', 'lightgrey'),
])


def _crossref_headers():
    '''
    Build the headers sent to CrossRef: a User-Agent that includes a contact address.
    '''
    return {'User-Agent': f'Jupyter Notebook (mailto:{email})'}


def get_total_results(issn):
    '''
    Get the total number of articles in CrossRef for this journal.

    Returns 0 if the response is not JSON or doesn't have the expected
    structure (for example an error payload), so callers can treat a failed
    lookup as "nothing to harvest".
    '''
    response = s.get(
        f'https://api.crossref.org/journals/{issn}/works/',
        params={'rows': 0},
        headers=_crossref_headers()
    )
    try:
        return response.json()['message']['total-results']
    except (ValueError, KeyError, TypeError):
        # ValueError: body wasn't JSON; KeyError/TypeError: unexpected structure
        return 0


def get_title(record):
    '''
    Titles are in a list – join any values.

    Returns None if the record has no 'title' key.
    '''
    title = record.get('title')
    if isinstance(title, list):
        title = ' – '.join(title)
    return title


def harvest_works(issn):
    '''
    Harvest basic details (DOI, title, date) of articles from the journal with the supplied ISSN from CrossRef.

    Returns a list of dicts with the keys 'doi', 'title' and 'year'.
    Pages that can't be parsed are reported and skipped, as are records
    without a usable issued date.
    '''
    works = []
    total_results = get_total_results(issn)
    url = f'https://api.crossref.org/journals/{issn}/works/'
    params = {'rows': CROSSREF_ROWS, 'offset': 0}
    headers = _crossref_headers()
    with tqdm(total=total_results) as pbar:
        # Offsets are zero-based, so an offset equal to the total is already
        # past the last record – strictly-less-than avoids a wasted request.
        while params['offset'] < total_results:
            response = s.get(url, params=params, headers=headers)
            try:
                records = response.json()['message']['items']
            except (ValueError, KeyError, TypeError):
                print('TYPEERROR')
                print(response.text)
                # Skip this page rather than abandoning the whole harvest
                records = []
            for record in records:
                try:
                    year = record['issued']['date-parts'][0][0]
                except (KeyError, IndexError, TypeError):
                    print('KEYERROR')
                    print(record)
                    continue
                works.append({'doi': record.get('DOI'), 'title': get_title(record), 'year': year})
            pbar.update(len(records))
            params['offset'] += CROSSREF_ROWS
    return works


def get_oa_status(doi):
    '''
    Get OA status of DOI from the Unpaywall API.

    Returns the 'oa_status' value from the response, or None if there's no
    DOI, the response isn't a JSON object, or it has no 'oa_status' (so one
    unknown DOI doesn't abort a long-running lookup over a whole journal).
    '''
    if not doi:
        return None
    response = s.get(f'https://api.unpaywall.org/v2/{doi}', params={'email': email})
    try:
        data = response.json()
    except ValueError:
        return None
    if not isinstance(data, dict):
        return None
    return data.get('oa_status')


def create_scale(df):
    '''
    Set colour range to match the OA status types.

    Returns the colours for the statuses that actually occur in
    df['oa_status'], in the order defined by OA_STATUS_COLOURS.
    '''
    status_values = set(df['oa_status'].unique())
    return [colour for status, colour in OA_STATUS_COLOURS.items() if status in status_values]


def chart_oa_status(df, title):
    '''
    Display a stacked bar chart of the number of articles per year, coloured by OA status.

    Expects df to have 'year' and 'oa_status' columns. Rows whose status
    is missing or not one of the known types are left out of the chart.
    The supplied dataframe is not modified.
    '''
    # Numeric rank for each status ('closed' lowest) makes it easy to sort by oa_status
    order = {status: i for i, status in enumerate(list(OA_STATUS_COLOURS)[::-1])}
    # Work on a filtered copy so the caller's dataframe doesn't gain an 'order' column
    chart_df = df[df['oa_status'].isin(list(order))].copy()
    chart_df['order'] = chart_df['oa_status'].map(order)
    # Get colour values
    scale = create_scale(chart_df)
    chart = alt.Chart(chart_df).mark_bar().encode(
        x=alt.X('year:O', title='Year'),
        y=alt.Y('count():Q', title='Number of articles', axis=alt.Axis(tickMinStep=1)),
        color=alt.Color(
            'oa_status:N',
            scale=alt.Scale(range=scale),
            legend=alt.Legend(title='OA type'),
            sort=alt.EncodingSortField('order', order='descending')
        ),
        order='order',
        tooltip=[
            alt.Tooltip('count():Q', title='Number of articles'),
            alt.Tooltip('oa_status', title='OA type')
        ]
    ).properties(title=title)
    # display() is supplied by the notebook environment
    display(chart)


# ## Australian Historical Studies
#
# * ISSN: 1031-461X
# * [Website](https://www.tandfonline.com/toc/rahs20/current)

# In[37]:


works_ahs = harvest_works('1031-461X')


# In[38]:


df_ahs = pd.DataFrame(works_ahs)
df_ahs.shape


# In[39]:


# Make sure there's no duplicates
df_ahs.drop_duplicates(inplace=True)
df_ahs.shape


# In[40]:


# Show repeated titles
df_ahs['title'].value_counts()[:25]


# In[41]:


# Get rid of titles that appear more than once
df_ahs_unique = df_ahs.copy().drop_duplicates(subset='title', keep=False)
df_ahs_unique.shape


# In[42]:


df_ahs_unique['oa_status'] = df_ahs_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[43]:


df_ahs_unique['oa_status'].value_counts()


# In[44]:


df_ahs_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[45]:


chart_oa_status(df_ahs_unique, title='Australian Historical Studies')


# ## History Australia
#
# * ISSN: 1449-0854
# *
#   [Website](https://www.tandfonline.com/toc/raha20/current)
# * [Archived issues in Trove](https://webarchive.nla.gov.au/tep/46522)

# In[46]:


# Harvest the journal's article list from CrossRef
works_ha = harvest_works('1449-0854')


# In[47]:


df_ha = pd.DataFrame(works_ha)
df_ha.shape


# In[48]:


# Remove rows that are exact duplicates
df_ha = df_ha.drop_duplicates()
df_ha.shape


# In[49]:


# Any records without a title?
df_ha[df_ha['title'].isna()]


# In[50]:


# Remove records without a title
df_ha = df_ha.dropna(subset=['title'])
df_ha.shape


# In[51]:


# The most frequently repeated titles
df_ha['title'].value_counts().head(30)


# In[52]:


# Remove every title that appears more than once (recurring sections)
df_ha_unique = df_ha.drop_duplicates(subset='title', keep=False).copy()
df_ha_unique.shape


# In[53]:


# Look up each remaining DOI in Unpaywall
df_ha_unique['oa_status'] = df_ha_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[54]:


df_ha_unique['oa_status'].value_counts()


# In[55]:


# The same counts as percentages
(df_ha_unique['oa_status'].value_counts(normalize=True) * 100).round(1).astype(str) + '%'


# In[56]:


chart_oa_status(df_ha_unique, title='History Australia')


# ## Australian Journal of Politics and History
#
# * ISSN: 1467-8497
# * [Website](https://onlinelibrary.wiley.com/journal/14678497)
#
# There are clearly some problems with dates in the CrossRef data.
# In[57]:


# Harvest the journal's article list from CrossRef
works_ajph = harvest_works('1467-8497')


# In[58]:


df_ajph = pd.DataFrame(works_ajph)
df_ajph.shape


# In[59]:


# Remove rows that are exact duplicates
df_ajph = df_ajph.drop_duplicates()
df_ajph.shape


# In[60]:


# Any records without a title?
df_ajph[df_ajph['title'].isna()]


# In[61]:


# Remove records without a title
df_ajph = df_ajph.dropna(subset=['title'])
df_ajph.shape


# In[62]:


# The most frequently repeated titles
df_ajph['title'].value_counts().head(40)


# In[63]:


# Remove every title that appears more than once (recurring sections)
df_ajph_unique = df_ajph.drop_duplicates(subset='title', keep=False).copy()
df_ajph_unique.shape


# In[64]:


# Look up each remaining DOI in Unpaywall
df_ajph_unique['oa_status'] = df_ajph_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[65]:


df_ajph_unique['oa_status'].value_counts()


# In[66]:


# The same counts as percentages
(df_ajph_unique['oa_status'].value_counts(normalize=True) * 100).round(1).astype(str) + '%'


# In[67]:


chart_oa_status(df_ajph_unique, title='Australian Journal of Politics and History')


# ## Journal of Australian Studies
#
# * ISSN: 1444-3058
# * [Website](https://www.tandfonline.com/toc/rjau20/current)

# In[68]:


# Harvest the journal's article list from CrossRef
works_jas = harvest_works('1444-3058')


# In[69]:


df_jas = pd.DataFrame(works_jas)
df_jas.shape


# In[70]:


# Remove rows that are exact duplicates
df_jas = df_jas.drop_duplicates()
df_jas.shape


# In[71]:


# Any records without a title?
df_jas[df_jas['title'].isna()]


# In[72]:


# Remove records without a title
df_jas = df_jas.dropna(subset=['title'])
df_jas.shape


# In[73]:


# The most frequently repeated titles
df_jas['title'].value_counts().head(30)


# In[74]:


# Remove every title that appears more than once (recurring sections)
df_jas_unique = df_jas.drop_duplicates(subset='title', keep=False).copy()
df_jas_unique.shape


# In[75]:


# Look up each remaining DOI in Unpaywall
df_jas_unique['oa_status'] = df_jas_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[76]:


df_jas_unique['oa_status'].value_counts()


# In[77]:


# The same counts as percentages
(df_jas_unique['oa_status'].value_counts(normalize=True) * 100).round(1).astype(str) + '%'


# In[78]:


chart_oa_status(df_jas_unique, title='Journal of Australian Studies')


# ## Australian Archaeology
#
# * ISSN: 0312-2417
# * [Website](https://www.tandfonline.com/toc/raaa20/current)

# In[79]:


# Harvest the journal's article list from CrossRef
works_aa = harvest_works('0312-2417')


# In[80]:


df_aa = pd.DataFrame(works_aa)
df_aa.shape


# In[81]:


# Remove rows that are exact duplicates
df_aa = df_aa.drop_duplicates()
df_aa.shape


# In[82]:
# Any records without a title?
df_aa[df_aa['title'].isna()]


# In[83]:


# Remove records without a title
df_aa = df_aa.dropna(subset=['title'])
df_aa.shape


# In[84]:


# The most frequently repeated titles
df_aa['title'].value_counts().head(30)


# In[85]:


# Remove every title that appears more than once (recurring sections)
df_aa_unique = df_aa.drop_duplicates(subset='title', keep=False).copy()
df_aa_unique.shape


# In[86]:


# Look up each remaining DOI in Unpaywall
df_aa_unique['oa_status'] = df_aa_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[87]:


df_aa_unique['oa_status'].value_counts()


# In[88]:


# The same counts as percentages
(df_aa_unique['oa_status'].value_counts(normalize=True) * 100).round(1).astype(str) + '%'


# In[89]:


chart_oa_status(df_aa_unique, title='Australian Archaeology')


# ## Archives and Manuscripts
#
# * ISSN: 0157-6895
# * [Website](https://www.tandfonline.com/toc/raam20/current)
#
# Note that articles published before 2012 are available through an [open access repository](https://publications.archivists.org.au/index.php/asa).

# In[90]:


# Harvest the journal's article list from CrossRef
works_am = harvest_works('0157-6895')


# In[91]:


df_am = pd.DataFrame(works_am)
df_am.shape


# In[92]:


# Remove rows that are exact duplicates
df_am = df_am.drop_duplicates()
df_am.shape


# In[93]:


# Any records without a title?
df_am[df_am['title'].isna()]


# In[94]:


# Remove records without a title
df_am = df_am.dropna(subset=['title'])
df_am.shape


# In[95]:


# The most frequently repeated titles
df_am['title'].value_counts().head(30)


# In[96]:


# Remove every title that appears more than once (recurring sections)
df_am_unique = df_am.drop_duplicates(subset='title', keep=False).copy()
df_am_unique.shape


# In[97]:


# Look up each remaining DOI in Unpaywall
df_am_unique['oa_status'] = df_am_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[98]:


df_am_unique['oa_status'].value_counts()


# In[99]:


# The same counts as percentages
(df_am_unique['oa_status'].value_counts(normalize=True) * 100).round(1).astype(str) + '%'


# In[100]:


chart_oa_status(df_am_unique, title='Archives and Manuscripts')


# ## Journal of the Australian Library and Information Association
#
# * ISSN: 2475-0158
# * [Website](https://www.tandfonline.com/toc/ualj21/current)
#
# Previously _Australian Academic and Research Libraries_
#
# * ISSN: 0004-8623
# * [Website](https://www.tandfonline.com/toc/uarl20/current)
# * [Archived issues available in
#   Trove](https://webarchive.nla.gov.au/awa/20130209041025/http://pandora.nla.gov.au/pan/128690/20130208-0850/www.alia.org.au/publishing/aarl/index.html)
#
# Note that most of AARL seems to be 'bronze', but is not being accurately reported by the Unpaywall API.

# In[101]:


# Harvest the earlier title (Australian Academic and Research Libraries)...
works_aarn = harvest_works('0004-8623')


# In[102]:


# ...and the current title
works_jalia = harvest_works('2475-0158')


# In[103]:


# Combine both harvests into one dataframe. ignore_index=True renumbers the
# rows; without it both halves keep their own 0..n labels and the combined
# index contains duplicates.
df_jalia = pd.concat([pd.DataFrame(works_aarn), pd.DataFrame(works_jalia)], ignore_index=True)
df_jalia.shape


# In[104]:


# Make sure there's no duplicates
df_jalia.drop_duplicates(inplace=True)
df_jalia.shape


# In[105]:


# Show records without a title
df_jalia.loc[df_jalia['title'].isnull()]


# In[106]:


df_jalia.dropna(subset=['title'], inplace=True)
df_jalia.shape


# In[107]:


# Show repeated titles
df_jalia['title'].value_counts()[:40]


# In[108]:


# Get rid of titles that appear more than once
df_jalia_unique = df_jalia.copy().drop_duplicates(subset='title', keep=False)
df_jalia_unique.shape


# In[109]:


df_jalia_unique['oa_status'] = df_jalia_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[110]:


df_jalia_unique['oa_status'].value_counts()


# In[111]:


df_jalia_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'


# In[112]:


chart_oa_status(df_jalia_unique, title='Journal of the Australian Library and Information Association')


# ## Labour History
#
# * ISSN: 0023-6942
# * [Website](https://www.liverpooluniversitypress.co.uk/journals/id/55)

# In[113]:


works_lh = harvest_works('0023-6942')


# In[114]:


df_lh = pd.DataFrame(works_lh)
df_lh.shape


# In[115]:


# Make sure there's no duplicates
df_lh.drop_duplicates(inplace=True)
df_lh.shape


# In[116]:


# Show records without a title
df_lh.loc[df_lh['title'].isnull()]


# In[117]:


df_lh.dropna(subset=['title'], inplace=True)
df_lh.shape


# In[118]:


# Show repeated titles
df_lh['title'].value_counts()[:30]


# In[119]:


# Get rid of titles that appear more than once
df_lh_unique = df_lh.copy().drop_duplicates(subset='title', keep=False)
df_lh_unique.shape


# In[120]:


df_lh_unique['oa_status'] = df_lh_unique['doi'].progress_apply(get_oa_status)


# ### Results

# In[121]:


df_lh_unique['oa_status'].value_counts()


# In[122]:
# OA status counts as percentages
(df_lh_unique['oa_status'].value_counts(normalize=True) * 100).round(1).astype(str) + '%'


# In[123]:


chart_oa_status(df_lh_unique, title='Labour History')


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org)
# This work is licensed under a Creative Commons Attribution 4.0 International License.