import datetime
from urllib.parse import urlparse, urlsplit
import requests
import os.path
import tempfile
import copy
import re
from pyzotero import zotero
from trove import Trove
# Lookup table from a Trove work type (the key) to the Zotero item type
# (the value) used when creating the item. Types that are not listed here
# are handled by the fallback in guess_zotero_type().
TROVE_ZOTERO_MAPPINGS = {
    # Art works
    'Art work': 'artwork',

    # Articles
    'Article': 'journalArticle',
    'Article/Book chapter': 'bookSection',
    'Article/Conference paper': 'conferencePaper',
    'Article/Journal or magazine article': 'journalArticle',
    'Article/Other article': 'journalArticle',
    'Article/Report': 'report',
    'Article/Review': 'journalArticle',
    'Article/Working paper': 'report',

    # Books
    'Audio book': 'book',
    'Book': 'book',
    'Book/Braille': 'book',
    'Book/Illustrated': 'book',
    'Book/Large print': 'book',
    'Conference Proceedings': 'book',

    # Data
    'Data set': 'computerProgram',

    # Maps
    'Map': 'map',
    'Map/Aerial photograph': 'map',
    'Map/Atlas': 'map',
    'Map/Braille': 'map',
    'Map/Electronic': 'map',
    'Map/Globe or object': 'map',
    'Map/Large print': 'map',
    'Map/Map series': 'map',
    'Map/Microform': 'map',
    'Map/Single map': 'map',

    # Objects and periodicals
    'Object': 'artwork',
    'Periodical': 'book',
    'Periodical/Journal, magazine, other': 'book',
    'Periodical/Newspaper': 'book',

    # Pictures and miscellaneous documents
    'Photograph': 'artwork',
    'Poster, chart, other': 'artwork',
    'Published': 'document',
    'Sheet music': 'document',

    # Sound
    'Sound': 'audioRecording',
    'Sound/Interview, lecture, talk': 'audioRecording',
    'Sound/Other sound': 'audioRecording',
    'Sound/Recorded music': 'audioRecording',

    # Theses and unpublished material
    'Thesis': 'thesis',
    'Unpublished': 'manuscript',

    # Video
    'Video': 'videoRecording',
    'Video/Captioned': 'videoRecording',
}
# For each Zotero item type: which Trove detail field (key) is written into
# which Zotero template field (value).
#
# Fields that hold multiple values are deliberately left out of these tables
# and are handled separately:
#   contributor -> creator
#   tags        -> tags
#   subject     -> tags
FIELD_MAPPINGS = {
    'journalArticle': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'publicationTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'book': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'edition': 'edition',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'bookSection': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'bookTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'publisher': 'publisher',
        'edition': 'edition',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'conferencePaper': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'proceedingsTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'report': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'institution',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'thesis': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'university',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'artwork': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'map': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'computerProgram': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'abstract': 'abstractNote',
        'publisher': 'company',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'document': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'manuscript': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'audioRecording': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'seriesTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'label',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'videoRecording': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'seriesTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'studio',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'encyclopediaArticle': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'encyclopediaTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog',
    },
    'webpage': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'websiteTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'rights': 'rights',
    },
}
def guess_zotero_type(item_type):
    '''
    Check mappings to try and find a zotero type.

    Parameters:
        item_type: a Trove type string, or a list of them (only the first
            entry of a list is used).

    Returns:
        The matching value from TROVE_ZOTERO_MAPPINGS, or 'journalArticle'
        when the type is unknown, missing (empty list) or not usable as a
        dictionary key.
    '''
    if isinstance(item_type, list):
        # An empty list previously raised IndexError; treat it as "unknown".
        item_type = item_type[0] if item_type else None
    try:
        return TROVE_ZOTERO_MAPPINGS[item_type]
    except (KeyError, TypeError):
        # KeyError: type not in the table. TypeError: unhashable value.
        return 'journalArticle'
def process_name(name):
    '''
    Try and do some cleaning of names that have dates included.

    The name is split on commas: the first part is taken as the family name
    and the second part (if any) as the other names. Anything after a second
    comma (presumably the dates mentioned above) is discarded. Surrounding
    whitespace is stripped from both parts so that 'Smith, John' yields
    'John' rather than ' John'.

    Parameters:
        name: the contributor name as a single string.

    Returns:
        A dict with the keys 'family_name' and 'other_names'; 'other_names'
        is an empty string when the name contains no comma.
    '''
    parts = [part.strip() for part in name.split(',')]
    family_name = parts[0]
    other_names = parts[1] if len(parts) > 1 else ''
    return {'family_name': family_name, 'other_names': other_names}
def extract_filename_from_url(url):
    '''
    Try and get the filename of attachments.

    Parameters:
        url: the attachment url.

    Returns:
        The last component of the url path when it ends with one of the
        recognised extensions (jpg, gif, png, tif, pdf -- matched without
        regard to case), otherwise None.
    '''
    filename = os.path.basename(urlsplit(url).path)
    # The original pattern was missing a '|' between '\.tif' and '\.pdf', so
    # neither extension could match on its own. The pattern is also anchored
    # to the end of the name so that only a real extension counts.
    if not re.search(r'\.(jpg|gif|png|tif|pdf)$', filename, re.IGNORECASE):
        return None
    return filename
def prepare_attachment(url, default):
    '''
    Save a local copy of attachment, and return the local file path.

    Parameters:
        url: the url to download.
        default: the filename to use when one can't be extracted from the
            url by extract_filename_from_url().

    Returns:
        The path of the downloaded file inside the system temp directory.

    Raises:
        requests.exceptions.RequestException: if the download times out or
            the server answers with an error status. Without this check an
            error page would be saved and handed on as the attachment.
    '''
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    filename = extract_filename_from_url(url) or default
    local_path = os.path.join(tempfile.gettempdir(), filename)
    with open(local_path, 'wb') as attachment:
        attachment.write(response.content)
    return local_path
def prepare_tags(tags):
    '''
    Wrap each tag in a {'tag': ...} dict, the object format expected by Zotero.

    Returns a new list in the same order as the input.
    '''
    formatted = []
    for label in tags:
        formatted.append({'tag': label})
    return formatted
def get_newspaper_pdf(article_id):
    '''
    Use my proxy app to get the url to the PDF copy of an article.

    Parameters:
        article_id: the Trove newspaper article id.

    Returns:
        The body of the proxy's response as text.

    Raises:
        requests.exceptions.RequestException: if the request times out or
            the proxy answers with an error status, so that an error page is
            never returned as if it were a url.
    '''
    proxy_url = 'https://trove-proxy.herokuapp.com/pdf/{}'.format(article_id)
    response = requests.get(proxy_url, timeout=60)
    response.raise_for_status()
    return response.text
def create_zotero_object(zotero_api, trove_api, record):
    '''
    Process Trove record to populate fields in Zotero item template.

    Parameters:
        zotero_api: object providing item_template(item_type).
        trove_api: object providing get_item(item_id=..., item_type=...).
        record: a Trove list record -- a dict holding the item under one of
            the keys 'work', 'people', 'article' or 'externalWebsite'. The
            first such key (in the record's own order) is used; other keys
            are ignored.

    Returns:
        A dict with 'zotero_item' (the populated template) and 'attachments'
        (a list of local file paths produced by prepare_attachment()).

    Raises:
        ValueError: if the record holds none of the supported item types.
    '''
    supported_types = ('work', 'people', 'article', 'externalWebsite')
    # Don't assume the item type is the first key: skip over any other keys.
    item_type = next((key for key in record if key in supported_types), None)
    if item_type is None:
        raise ValueError('Unsupported Trove list record: {}'.format(list(record)))
    item = record[item_type]
    attachments = []

    if item_type == 'work':
        zotero_type = guess_zotero_type(item['type'])
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        work = trove_api.get_item(item_id=item['id'], item_type='work')
        details = work.get_details()

        # Single-valued fields, driven by the per-type mapping table.
        for t_field, z_field in FIELD_MAPPINGS[zotero_type].items():
            if t_field in details:
                value = details[t_field]
                # Values are joined as sequences of strings; a bare string is
                # passed through so it isn't split into characters.
                template[z_field] = value if isinstance(value, str) else '; '.join(value)

        # Creators: the original code indexed template['creators'] by the
        # contributor's position, which fails as soon as there are more
        # contributors than creator entries in the template. Build one entry
        # per contributor instead, modelled on the template's first entry.
        if 'contributor' in details and details['contributor']:
            creator_pattern = template['creators'][0]
            creators = []
            for contributor in details['contributor']:
                names = process_name(contributor)
                creator = dict(creator_pattern)
                creator['firstName'] = names['other_names']
                creator['lastName'] = names['family_name']
                creators.append(creator)
            template['creators'] = creators

        # Tags and subjects both become Zotero tags. Copy the list first so
        # the one returned by the work object isn't modified.
        tags = list(work.get_all_tags() or [])
        if 'subject' in details:
            tags.extend(details['subject'])
        if tags:
            template['tags'] = prepare_tags(tags)

        # Holding repository: prefer the contributor looked up via its NUC.
        source = work.get_repository()
        repository = None
        if source['nuc']:
            contributor = trove_api.get_item(item_id=source['nuc'], item_type='contributor')
            repository = contributor.get_title()
        elif source['repository']:
            repository = source['repository']
        if repository:
            template['archive'] = repository

        # Attachments: the best available image, then a PDF if there is one.
        urls = work.get_urls()
        if 'mediumresolution' in urls:
            image_url = urls['mediumresolution']
        elif 'thumbnail' in urls:
            image_url = urls['thumbnail']
        else:
            image_url = None
        if image_url:
            attachments.append(prepare_attachment(image_url, 'image.jpg'))
        pdf_url = work.get_pdf_url()
        if pdf_url:
            attachments.append(prepare_attachment(pdf_url, 'article.pdf'))

    elif item_type == 'people':
        template = copy.deepcopy(zotero_api.item_template('encyclopediaArticle'))
        template['title'] = 'Trove party record'
        template['url'] = item['troveUrl']

    elif item_type == 'article':
        template = copy.deepcopy(zotero_api.item_template('newspaperArticle'))
        template['title'] = item['heading']
        template['url'] = 'http://nla.gov.au/nla.news-article{}'.format(item['id'])
        template['publicationTitle'] = item['title']['value']
        template['pages'] = item['page']
        template['date'] = item['date']
        pdf_url = get_newspaper_pdf(item['id'])
        attachments.append(prepare_attachment(pdf_url, 'article-{}.pdf'.format(item['id'])))

    else:  # 'externalWebsite'
        template = copy.deepcopy(zotero_api.item_template('webpage'))
        template['title'] = item['title']
        template['url'] = item['identifier']['value']

    # Everything except web pages is catalogued as coming from Trove.
    if template['itemType'] != 'webpage':
        template['libraryCatalog'] = 'Trove'
    template['accessDate'] = datetime.datetime.now().date().isoformat()
    return {'zotero_item': template, 'attachments': attachments}
def create_zotero_collection(zotero_api, collection_name):
    '''
    Create a Zotero collection called collection_name and look up its key.

    Returns the key of the first collection found by a search on the name,
    or None when no name was supplied, the creation call returned a falsy
    result, or the key could not be read back (the last two cases also print
    a message).
    '''
    if not collection_name:
        return None
    if not zotero_api.create_collections([{'name': collection_name}]):
        print('Error creating collection.')
        return None
    matches = zotero_api.collections(q=collection_name)
    try:
        return matches[0]['key']
    except (IndexError, KeyError):
        print('Error retrieving collection key.')
        return None
def check_duplicate_collection(zotero_api, collection_name):
    '''
    Look for an existing collection matching the supplied name.

    Returns the key of the first collection returned by a search on
    collection_name, or None when the search returns nothing.
    '''
    matches = zotero_api.collections(q=collection_name)
    return matches[0]['key'] if matches else None
def export_list(list_id, zotero_api, trove_api):
    '''
    Export the items of a Trove list into a Zotero collection.

    The collection is named '<list title> (Trove list: <list_id>)'. An
    existing collection found under that name is reused, otherwise a new one
    is created. Each list item not marked as deleted is converted with
    create_zotero_object(), added to the collection, and has its attachments
    uploaded.

    Parameters:
        list_id: the id of the Trove list to export.
        zotero_api: the Zotero client used to create collections and items.
        trove_api: the Trove client used to fetch the list and its items.

    Returns:
        None. Progress and errors are reported with print().
    '''
    trove_list = trove_api.get_item(item_id=list_id, item_type='list')
    list_name = '{} (Trove list: {})'.format(trove_list.get_title(), list_id)
    print(list_name)
    collection_key = check_duplicate_collection(zotero_api, list_name)
    if not collection_key:
        collection_key = create_zotero_collection(zotero_api, list_name)
    if not collection_key:
        # No collection to add to; the failure has already been reported.
        return
    for item in trove_list.list_items:
        if 'deleted' in item:
            continue
        details = create_zotero_object(zotero_api, trove_api, item)
        zotero_item = details['zotero_item']
        zotero_item['collections'] = [collection_key]
        response = zotero_api.create_items([zotero_item])
        if details['attachments']:
            # Attachments need the key of the item just created. If the
            # response doesn't contain one the item wasn't created: report
            # it and carry on rather than aborting the whole export.
            try:
                item_key = response['successful']['0']['key']
            except (KeyError, TypeError):
                print('Error creating item, attachments not uploaded: {}'.format(zotero_item['title']))
                continue
            zotero_api.attachment_simple(details['attachments'], item_key)
        print('New item added: {}'.format(zotero_item['title']))
# Script configuration and entry point: everything below runs at module level
# and performs the export as soon as the file is executed.
#
# NOTE(review): the library id and both API keys are hard-coded. They are
# described as temporary demo values -- replace them with your own before use.

# Id of the Zotero library to export into. This is a temporary group id
# created for demo purposes; replace it with your own personal or group
# library id.
zotero_library_id = '2315662'
# Type of the Zotero library: either 'user' or 'group'.
zotero_library_type = 'group'
# Zotero API key. This is a temporary key created for demonstration purposes.
zotero_library_key = 'zUjtvoJwuUNTwxKGIbj6t8wt'
# Zotero client for the library configured above.
zot_api = zotero.Zotero(zotero_library_id, zotero_library_type, zotero_library_key)
# Id of the Trove list to export; replace this with the id of your own list.
trove_list_id = '83777'
# Trove API key. This is a temporary key created for demonstration purposes;
# replace it with your own.
trove_api_key = 'ju3rgk0jp354ikmh'
# Trove client built from the key above.
trove_api = Trove(trove_api_key)
# Run the export.
export_list(trove_list_id, zot_api, trove_api)