Previously I attempted some analysis of the open access status of research articles published in Australian Historical Studies. I thought it would be interesting to try some comparisons with other Australian HASS subscription-based journals.
I've simplified the process here to make it easier to run an analysis of any journal. The steps are:
I then do some simple analysis of the OA status, and visualise the results over time.
The Unpaywall API returns one of five values for the OA status of an article – 'Gold', 'Hybrid', 'Green', 'Bronze', and 'Closed'. There's some more information on how these are determined on the Unpaywall site. Put simply:
So far I've looked at the following journals (more suggestions welcome):
Of course, this analysis is focused on subscription journals. There are also open access journals like the Public History Review where all the articles would be 'Gold'!
The results are not good. Articles published in Australia's main subscription history journals are about 94% closed. This is despite the fact that Green OA policies allow authors to deposit versions of their articles in public repositories (often after an embargo period).
Journal | Closed |
---|---|
Australian Historical Studies | 94.6% |
History Australia | 94.9% |
Australian Journal of Politics and History | 95.7%* |
Journal of Australian Studies | 94.2% |
Australian Archaeology | 83.4% |
Archives and Manuscripts (2012-) | 24.8% |
Journal of the Australian Library and Information Association | 52.5%* |
Labour History | 93.9% |
* Problems with data noted below |
This can be fixed! If you're in a university, talk to your librarians about depositing a Green OA version of your article in an institutional repository. If not, you can use the Share your paper service to upload a Green OA version to Zenodo. Your research will be easier to find, easier to access, easier to use, and available to everyone – not just those with the luxury of an institutional subscription.
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests_cache
from tqdm.auto import tqdm
import pandas as pd
import altair as alt
import collections
# Use a cached session so repeated API requests are answered from a local
# cache instead of hitting the CrossRef/Unpaywall servers again.
s = requests_cache.CachedSession()
# Retry transient server errors (502/503/504) up to 5 times with
# exponential backoff before giving up.
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
# Enables Series.progress_apply() with a tqdm progress bar (used below).
tqdm.pandas(desc="records")
# the APIs are open, but it's polite to let the APIs know who you are
email = '[email protected]'
def get_total_results(issn):
    '''
    Get the total number of articles in CrossRef for this journal.

    Parameters:
      issn – journal ISSN string, e.g. '1031-461X'
    Returns:
      int – the 'total-results' count reported by CrossRef, or 0 if the
      response is missing or malformed
    '''
    # Send the same polite-pool identification header as harvest_works()
    headers = {
        'User-Agent': f'Jupyter Notebook (mailto:{email})'
    }
    response = s.get(f'https://api.crossref.org/journals/{issn}/works/', params={'rows': 0}, headers=headers)
    data = response.json()
    try:
        total_works = data['message']['total-results']
    # TypeError covers the case where 'message' is not a dict
    # (CrossRef sometimes returns an error string)
    except (KeyError, TypeError):
        total_works = 0
    return total_works
def get_title(record):
    '''
    Titles are in a list – join any values
    '''
    raw = record.get('title')
    return ' – '.join(raw) if isinstance(raw, list) else raw
def harvest_works(issn):
    '''
    Harvest basic details (DOI, title, date) of articles from the journal with the supplied ISSN from CrossRef.

    Parameters:
      issn – journal ISSN string, e.g. '1031-461X'
    Returns:
      list of dicts, each with 'doi', 'title', and 'year' keys
    '''
    harvested = 0
    works = []
    total_results = get_total_results(issn)
    params = {
        'rows': 100,
        'offset': 0
    }
    # Identify ourselves so CrossRef routes us to the 'polite' pool
    headers = {
        'User-Agent': f'Jupyter Notebook (mailto:{email})'
    }
    with tqdm(total=total_results) as pbar:
        # Strict '<' avoids one extra, empty request when total_results
        # is an exact multiple of the page size (or is zero).
        while harvested < total_results:
            params['offset'] = harvested
            response = s.get(f'https://api.crossref.org/journals/{issn}/works/', params=params, headers=headers)
            data = response.json()
            try:
                records = data['message']['items']
            # KeyError added: 'message' may be missing as well as malformed
            except (TypeError, KeyError):
                print('TYPEERROR')
                print(data)
                # Fall through with an empty page so the progress bar
                # update below can't re-raise the error we just caught
                records = []
            for record in records:
                try:
                    works.append({'doi': record.get('DOI'), 'title': get_title(record), 'year': record['issued']['date-parts'][0][0]})
                except KeyError:
                    print('KEYERROR')
                    print(record)
            # Advance by the page size actually requested, rather than a
            # separately hard-coded 100
            harvested += params['rows']
            pbar.update(len(records))
    return works
def get_oa_status(doi):
    '''
    Get OA status of DOI from the Unpaywall API.

    Parameters:
      doi – DOI string, e.g. '10.1080/1031461X.2020.1717561'
    Returns:
      str – one of Unpaywall's oa_status values
      ('gold', 'hybrid', 'green', 'bronze', or 'closed')
    '''
    # Pass the email as a query parameter so requests URL-encodes it,
    # rather than interpolating it raw into the url
    response = s.get(f'https://api.unpaywall.org/v2/{doi}', params={'email': email})
    data = response.json()
    return data['oa_status']
def create_scale(df):
    '''
    Set colour range to match the OA status types.

    Parameters:
      df – DataFrame with an 'oa_status' column
    Returns:
      list of colour names, one per OA status present in the data,
      in the fixed order gold, hybrid, green, bronze, closed
    '''
    scale = []
    colours = collections.OrderedDict()
    # 'gold' was previously missing even though Unpaywall returns five
    # statuses; hybrid is now orange so the two don't share a colour
    colours['gold'] = 'gold'
    colours['hybrid'] = 'orange'
    colours['green'] = 'green'
    colours['bronze'] = 'brown'
    colours['closed'] = 'lightgrey'
    status_values = list(df['oa_status'].unique())
    # Only include colours for statuses actually present in the data,
    # so the range lines up with the chart's colour domain
    for status, colour in colours.items():
        if status in status_values:
            scale.append(colour)
    return scale
def chart_oa_status(df, title):
    '''
    Display a stacked bar chart of OA status by year for the supplied journal data.

    Parameters:
      df – DataFrame with 'year' and 'oa_status' columns
      title – chart title (usually the journal name)
    '''
    # Work on a copy so we don't add an 'order' column to the caller's df
    df = df.copy()
    # Adding a numeric order column makes it easy to sort by oa_status;
    # 'gold' was previously missing from this list, leaving gold articles unsorted
    df['order'] = df['oa_status'].replace({val: i for i, val in enumerate(['closed', 'bronze', 'green', 'hybrid', 'gold'])})
    # Get colour values
    scale = create_scale(df)
    chart = alt.Chart(df).mark_bar().encode(
        x=alt.X('year:O', title='Year'),
        y=alt.Y('count():Q', title='Number of articles', axis=alt.Axis(tickMinStep=1)),
        color=alt.Color('oa_status:N', scale=alt.Scale(range=scale), legend=alt.Legend(title='OA type'), sort=alt.EncodingSortField('order', order='descending')),
        order='order',
        tooltip=[alt.Tooltip('count():Q', title='Number of articles'), alt.Tooltip('oa_status', title='OA type')]
    ).properties(title=title)
    display(chart)
# ----------------------------------------------------------------
# Australian Historical Studies (ISSN 1031-461X)
# ----------------------------------------------------------------
works_ahs = harvest_works('1031-461X')
df_ahs = pd.DataFrame(works_ahs)
df_ahs.shape
# Make sure there's no duplicates
df_ahs.drop_duplicates(inplace=True)
df_ahs.shape
# Show repeated titles
df_ahs['title'].value_counts()[:25]
# Get rid of titles that appear more than once
df_ahs_unique = df_ahs.copy().drop_duplicates(subset='title', keep=False)
df_ahs_unique.shape
# Look up each DOI in Unpaywall (progress bar courtesy of tqdm.pandas)
df_ahs_unique['oa_status'] = df_ahs_unique['doi'].progress_apply(get_oa_status)
df_ahs_unique['oa_status'].value_counts()
# Same counts, expressed as percentages
df_ahs_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
chart_oa_status(df_ahs_unique, title='Australian Historical Studies')
# ----------------------------------------------------------------
# History Australia (ISSN 1449-0854)
# ----------------------------------------------------------------
works_ha = harvest_works('1449-0854')
df_ha = pd.DataFrame(works_ha)
df_ha.shape
# Remove exact duplicate records
df_ha.drop_duplicates(inplace=True)
df_ha.shape
# Inspect, then drop, records with no title
df_ha.loc[df_ha['title'].isnull()]
df_ha.dropna(subset=['title'], inplace=True)
df_ha.shape
# Show repeated titles
df_ha['title'].value_counts()[:30]
# Get rid of titles that appear more than once
df_ha_unique = df_ha.copy().drop_duplicates(subset='title', keep=False)
df_ha_unique.shape
# Look up each DOI in Unpaywall
df_ha_unique['oa_status'] = df_ha_unique['doi'].progress_apply(get_oa_status)
df_ha_unique['oa_status'].value_counts()
# Same counts, expressed as percentages
df_ha_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
chart_oa_status(df_ha_unique, title='History Australia')
# ----------------------------------------------------------------
# Australian Journal of Politics and History (ISSN 1467-8497)
# ----------------------------------------------------------------
works_ajph = harvest_works('1467-8497')
df_ajph = pd.DataFrame(works_ajph)
df_ajph.shape
# Remove exact duplicate records
df_ajph.drop_duplicates(inplace=True)
df_ajph.shape
# Inspect, then drop, records with no title
df_ajph.loc[df_ajph['title'].isnull()]
df_ajph.dropna(subset=['title'], inplace=True)
df_ajph.shape
# Show repeated titles
df_ajph['title'].value_counts()[:40]
# Get rid of titles that appear more than once
df_ajph_unique = df_ajph.copy().drop_duplicates(subset='title', keep=False)
df_ajph_unique.shape
# Look up each DOI in Unpaywall
df_ajph_unique['oa_status'] = df_ajph_unique['doi'].progress_apply(get_oa_status)
df_ajph_unique['oa_status'].value_counts()
# Same counts, expressed as percentages
df_ajph_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
chart_oa_status(df_ajph_unique, title='Australian Journal of Politics and History')
# ----------------------------------------------------------------
# Journal of Australian Studies (ISSN 1444-3058)
# ----------------------------------------------------------------
works_jas = harvest_works('1444-3058')
df_jas = pd.DataFrame(works_jas)
df_jas.shape
# Remove exact duplicate records
df_jas.drop_duplicates(inplace=True)
df_jas.shape
# Inspect, then drop, records with no title
df_jas.loc[df_jas['title'].isnull()]
df_jas.dropna(subset=['title'], inplace=True)
df_jas.shape
# Show repeated titles
df_jas['title'].value_counts()[:30]
# Get rid of titles that appear more than once
df_jas_unique = df_jas.copy().drop_duplicates(subset='title', keep=False)
df_jas_unique.shape
# Look up each DOI in Unpaywall
df_jas_unique['oa_status'] = df_jas_unique['doi'].progress_apply(get_oa_status)
df_jas_unique['oa_status'].value_counts()
# Same counts, expressed as percentages
df_jas_unique['oa_status'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
chart_oa_status(df_jas_unique, title='Journal of Australian Studies')
# ----------------------------------------------------------------
# Australian Archaeology (ISSN 0312-2417)
# (analysis continues beyond this excerpt)
# ----------------------------------------------------------------
works_aa = harvest_works('0312-2417')
df_aa = pd.DataFrame(works_aa)
df_aa.shape
# Remove exact duplicate records
df_aa.drop_duplicates(inplace=True)
df_aa.shape
# Inspect, then drop, records with no title
df_aa.loc[df_aa['title'].isnull()]
df_aa.dropna(subset=['title'], inplace=True)
df_aa.shape
# Show repeated titles
df_aa['title'].value_counts()[:30]