import collections
import csv
import gzip
import itertools
import json
import lzma
import pandas
def _process_unpaywall_record(row):
    """
    Normalize a raw Unpaywall record in place.

    The DOI is lowercased and these derived fields are set on ``row``:

    - ``journal_access``: True when at least one OA location has
      ``host_type == 'publisher'``, otherwise False.
    - ``journal_access_license`` / ``journal_access_evidence``: the license
      and evidence of the first publisher-hosted location, otherwise None.
    - ``journal_date``: ``published_date`` when truthy, otherwise ``year``
      (None when neither is available).
    - ``journal_issns``: the comma-separated ISSN string split into a list;
      an empty list when the field is missing or empty.

    A missing or null ``oa_locations`` is treated as "no OA locations", and
    an already-split ``journal_issns`` list is kept as a list, so the
    function can safely be applied twice to the same record.

    Returns None; ``row`` is mutated.
    """
    row['doi'] = row['doi'].lower()

    row['journal_access'] = False
    row['journal_access_license'] = None
    row['journal_access_evidence'] = None
    # `or []` covers both an absent key and an explicit null value.
    for location in row.get('oa_locations') or []:
        if location.get('host_type') == 'publisher':
            # Only the first publisher-hosted location is reported.
            row['journal_access'] = True
            row['journal_access_license'] = location.get('license')
            row['journal_access_evidence'] = location.get('evidence')
            break

    row['journal_date'] = row.get('published_date') or row.get('year')

    journal_issns = row.get('journal_issns')
    if not journal_issns:
        row['journal_issns'] = []
    elif isinstance(journal_issns, str):
        row['journal_issns'] = journal_issns.split(',')
    else:
        # Already a sequence of ISSNs (e.g. the record was processed before).
        row['journal_issns'] = list(journal_issns)
def read_unpaywall_snapshot(path, doi_subset=None):
    """
    Lazily yield processed records from an Unpaywall snapshot file that
    holds one JSON document per line.

    https://unpaywall.org/data-format
    https://unpaywall.org/products/snapshot

    path: location of the snapshot. A name ending in '.gz' is read through
        gzip; anything else is opened as a plain text file.
    doi_subset: optional container of lowercase DOIs. When given, only
        records whose lowercased DOI is in the container are yielded.
        Pass a set so each membership test is constant time.

    Every yielded row has been passed through _process_unpaywall_record.
    Blank lines are skipped.
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    # Decode explicitly as UTF-8 instead of relying on the locale's
    # default text encoding, which varies between platforms.
    with opener(path, 'rt', encoding='utf-8') as read_file:
        for line in read_file:
            if not line.strip():
                # An empty line is not a JSON document; json.loads would raise.
                continue
            row = json.loads(line)
            if doi_subset is None or row['doi'].lower() in doi_subset:
                _process_unpaywall_record(row)
                yield row
# Unpaywall record fields to export (keys) and the column names they are
# written under (values). Insertion order is the output column order.
record_renamer = {
    'doi': 'doi',
    'genre': 'crossref_type',
    'journal_date': 'journal_date',
    'is_oa': 'unpaywall_access',
    'journal_access': 'journal_access',
    'journal_access_evidence': 'journal_access_evidence',
    'journal_access_license': 'journal_access_license',
    'journal_is_oa': 'journal_fully_oa',
}


def _reduce_unpaywall_record(row):
    """
    Project a processed record onto the exported columns.

    Returns an OrderedDict keyed by the renamed columns of record_renamer,
    in that mapping's order. Boolean values are converted to 0/1 integers;
    every other value is passed through unchanged. Raises KeyError when
    the record lacks one of the source fields.
    """
    def _as_cell(value):
        # Booleans are stored as integers in the tabular output.
        return int(value) if isinstance(value, bool) else value

    return collections.OrderedDict(
        (column, _as_cell(row[field]))
        for field, column in record_renamer.items()
    )
# Input path (gzip-compressed snapshot, one JSON record per line)
path_jsonl = 'downloads/unpaywall/unpaywall_snapshot_2018-09-24T232615.jsonl.gz'

# Output paths (xz-compressed, tab-separated)
path_access_tsv = 'data/02.unpaywall-access.tsv.xz'
path_issn_tsv = 'data/02.unpaywall-issns.tsv.xz'

articles = read_unpaywall_snapshot(path_jsonl)

# Uncomment following line for development
# articles = itertools.islice(articles, 100)

# newline='' leaves line endings entirely to the csv module, as its
# documentation requires for files handed to a writer; otherwise platforms
# that translate '\n' would emit '\r\r\n'. The encoding is pinned so the
# output does not depend on the locale.
with lzma.open(path_access_tsv, 'wt', encoding='utf-8', newline='') as access_file, \
        lzma.open(path_issn_tsv, 'wt', encoding='utf-8', newline='') as issn_file:
    # One row per article with the renamed access columns.
    access_writer = csv.DictWriter(
        access_file,
        delimiter='\t',
        fieldnames=list(record_renamer.values()),
    )
    access_writer.writeheader()

    # One row per (article, ISSN) pair.
    issn_writer = csv.writer(issn_file, delimiter='\t')
    issn_writer.writerow(('doi', 'issn'))

    for article in articles:
        doi = article['doi']
        access_writer.writerow(_reduce_unpaywall_record(article))
        for issn in article['journal_issns']:
            issn_writer.writerow((doi, issn))

# Notebook display: show the last record that was processed.
article
{'doi': '10.1002/asi.20570', 'year': 2007, 'genre': 'journal-article', 'is_oa': True, 'title': 'Towards memory supporting personal information management tools', 'doi_url': 'https://doi.org/10.1002/asi.20570', 'updated': '2018-06-21T04:54:17.294334', 'publisher': 'Wiley-Blackwell', 'z_authors': [{'given': 'David', 'family': 'Elsweiler'}, {'given': 'Ian', 'family': 'Ruthven'}, {'given': 'Christopher', 'family': 'Jones'}], 'journal_name': 'Journal of the American Society for Information Science and Technology', 'oa_locations': [{'url': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf', 'pmh_id': 'oai:epub.uni-regensburg.de:22679', 'is_best': True, 'license': None, 'updated': '2018-06-20T18:33:20.945686', 'version': 'submittedVersion', 'evidence': 'oa repository (via OAI-PMH title and first author match)', 'host_type': 'repository', 'url_for_pdf': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf', 'url_for_landing_page': None}, {'url': 'https://strathprints.strath.ac.uk/2395/6/strathprints002395.pdf', 'pmh_id': 'oai:strathprints.strath.ac.uk:2395', 'is_best': False, 'license': None, 'updated': '2018-09-22T09:24:03.567239', 'version': 'submittedVersion', 'evidence': 'oa repository (via OAI-PMH title and first author match)', 'host_type': 'repository', 'url_for_pdf': 'https://strathprints.strath.ac.uk/2395/6/strathprints002395.pdf', 'url_for_landing_page': None}], 'data_standard': 2, 'journal_is_oa': False, 'journal_issns': ['1532-2882', '1532-2890'], 'published_date': '2007-01-01', 'best_oa_location': {'url': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf', 'pmh_id': 'oai:epub.uni-regensburg.de:22679', 'is_best': True, 'license': None, 'updated': '2018-06-20T18:33:20.945686', 'version': 'submittedVersion', 'evidence': 'oa repository (via OAI-PMH title and first author match)', 'host_type': 'repository', 'url_for_pdf': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf', 
'url_for_landing_page': None}, 'journal_is_in_doaj': False, 'journal_access': False, 'journal_access_license': None, 'journal_access_evidence': None, 'journal_date': '2007-01-01'}
unpaywall-access.tsv
# Read the per-article access table back in and preview its last rows.
article_df = pandas.read_csv(path_access_tsv, sep='\t')
article_df.tail()
doi | crossref_type | journal_date | unpaywall_access | journal_access | journal_access_evidence | journal_access_license | journal_fully_oa | |
---|---|---|---|---|---|---|---|---|
99940224 | 10.1002/nadc.19970450625 | journal-article | 1997-06-01 | 0 | 0 | NaN | NaN | 0 |
99940225 | 10.1371/journal.pbio.1001712.g004 | component | NaN | 1 | 1 | oa journal (via publisher name) | NaN | 1 |
99940226 | 10.1364/opex.12.002220.m005 | component | NaN | 0 | 0 | NaN | NaN | 0 |
99940227 | 10.2105/ajph.10.6.536 | journal-article | 1920-06-01 | 1 | 1 | open (via free pdf) | NaN | 0 |
99940228 | 10.1002/asi.20570 | journal-article | 2007-01-01 | 1 | 0 | NaN | NaN | 0 |
# Green OA only articles: openly available per Unpaywall, but not from the journal
article_df[(article_df['journal_access'] == 0) & (article_df['unpaywall_access'] == 1)].head(3)
doi | crossref_type | journal_date | unpaywall_access | journal_access | journal_access_evidence | journal_access_license | journal_fully_oa | |
---|---|---|---|---|---|---|---|---|
0 | 10.1080/21645515.2017.1330236 | journal-article | 2017-06-12 | 1 | 0 | NaN | NaN | 0 |
49 | 10.1016/j.drugalcdep.2016.08.636 | journal-article | 2016-11-01 | 1 | 0 | NaN | NaN | 0 |
54 | 10.1109/icecs.2001.957596 | proceedings-article | NaN | 1 | 0 | NaN | NaN | 0 |
# Hybrid/Bronze OA articles: available from the journal, which is not fully OA
article_df[(article_df['journal_access'] == 1) & (article_df['journal_fully_oa'] == 0)].head(3)
doi | crossref_type | journal_date | unpaywall_access | journal_access | journal_access_evidence | journal_access_license | journal_fully_oa | |
---|---|---|---|---|---|---|---|---|
3 | 10.1088/0004-6256/135/4/1201 | journal-article | 2008-03-04 | 1 | 1 | open (via free pdf) | NaN | 0 |
5 | 10.2478/v10172-012-0058-8 | journal-article | 2012-01-01 | 1 | 1 | open (via free pdf) | NaN | 0 |
13 | 10.1038/313176c0 | journal-article | 1985-01-01 | 1 | 1 | open (via free pdf) | NaN | 0 |
unpaywall-issns.tsv
# Read the DOI-to-ISSN table back in and preview its last rows.
issn_df = pandas.read_csv(path_issn_tsv, sep='\t')
issn_df.tail()
doi | issn | |
---|---|---|
112751994 | 10.1006/bbrc.1997.6706 | 0006-291X |
112751995 | 10.1002/nadc.19970450625 | 0341-5163 |
112751996 | 10.2105/ajph.10.6.536 | 0271-4353 |
112751997 | 10.1002/asi.20570 | 1532-2882 |
112751998 | 10.1002/asi.20570 | 1532-2890 |
# Sanity check: the frame below is expected to be empty, because every
# ISSN should be nine characters long (XXXX-XXXX).
invalid_issn_df = issn_df.loc[issn_df['issn'].str.len().ne(9)]
invalid_issn_df.head()
doi | issn |
---|