#!/usr/bin/env python
# coding: utf-8

# # Export a Trove list to Zotero

# In[1]:


import copy
import datetime
import os.path
import re
import tempfile
from urllib.parse import urlparse, urlsplit

import requests
from pyzotero import zotero
from trove import Trove

# Maps a Trove work 'type' value to the Zotero item type used to store it.
TROVE_ZOTERO_MAPPINGS = {
    'Art work': 'artwork',
    'Article': 'journalArticle',
    'Article/Book chapter': 'bookSection',
    'Article/Conference paper': 'conferencePaper',
    'Article/Journal or magazine article': 'journalArticle',
    'Article/Other article': 'journalArticle',
    'Article/Report': 'report',
    'Article/Review': 'journalArticle',
    'Article/Working paper': 'report',
    'Audio book': 'book',
    'Book': 'book',
    'Book/Braille': 'book',
    'Book/Illustrated': 'book',
    'Book/Large print': 'book',
    'Conference Proceedings': 'book',
    'Data set': 'computerProgram',
    'Map': 'map',
    'Map/Aerial photograph': 'map',
    'Map/Atlas': 'map',
    'Map/Braille': 'map',
    'Map/Electronic': 'map',
    'Map/Globe or object': 'map',
    'Map/Large print': 'map',
    'Map/Map series': 'map',
    'Map/Microform': 'map',
    'Map/Single map': 'map',
    'Object': 'artwork',
    'Periodical': 'book',
    'Periodical/Journal, magazine, other': 'book',
    'Periodical/Newspaper': 'book',
    'Photograph': 'artwork',
    'Poster, chart, other': 'artwork',
    'Published': 'document',
    'Sheet music': 'document',
    'Sound': 'audioRecording',
    'Sound/Interview, lecture, talk': 'audioRecording',
    'Sound/Other sound': 'audioRecording',
    'Sound/Recorded music': 'audioRecording',
    'Thesis': 'thesis',
    'Unpublished': 'manuscript',
    'Video': 'videoRecording',
    'Video/Captioned': 'videoRecording'
}

# For each Zotero item type: Trove detail field -> Zotero template field.
FIELD_MAPPINGS = {
    # Not including common fields that have multiple values
    # contributor -> creator
    # tags -> tags
    # subject -> tags
    'journalArticle': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'publicationTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'book': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'edition': 'edition',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'bookSection': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'bookTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'publisher': 'publisher',
        'edition': 'edition',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'conferencePaper': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'proceedingsTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'report': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'institution',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'thesis': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'university',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'artwork': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'map': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'computerProgram': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'abstract': 'abstractNote',
        'publisher': 'company',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'document': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'manuscript': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'audioRecording': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'seriesTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'label',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'videoRecording': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'seriesTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'studio',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'encyclopediaArticle': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'encyclopediaTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'webpage': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'websiteTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'rights': 'rights'
    }
}


# In[2]:


def guess_zotero_type(item_type):
    '''
    Check mappings to try and find a zotero type.

    item_type may be a single Trove type string or a list of them; for a
    list only the first entry is considered. Anything not present in
    TROVE_ZOTERO_MAPPINGS (including an empty list) falls back to
    'journalArticle'.
    '''
    if isinstance(item_type, list):
        if not item_type:
            # Nothing to look up -- use the same fallback as an unknown type.
            return 'journalArticle'
        item_type = item_type[0]
    return TROVE_ZOTERO_MAPPINGS.get(item_type, 'journalArticle')


def process_name(name):
    '''
    Try and do some cleaning of names that have dates included.

    Splits name on commas: the first part is taken as the family name and
    the second (if any) as the other names. Any further parts are dropped.
    Surrounding whitespace is stripped from both parts.

    Returns a dict with keys 'family_name' and 'other_names'.
    '''
    parts = name.split(',')
    family_name = parts[0].strip()
    other_names = parts[1].strip() if len(parts) > 1 else ''
    return {'family_name': family_name, 'other_names': other_names}


def extract_filename_from_url(url):
    '''
    Try and get the filename of attachments.

    Returns the last path segment of url when it ends in a recognised
    image/PDF extension (jpg, jpeg, gif, png, tif, tiff, pdf -- case
    insensitive), otherwise None.
    '''
    filename = os.path.basename(urlsplit(url).path)
    # Anchored at the end so only the real extension is tested.
    if not re.search(r'\.(?:jpe?g|gif|png|tiff?|pdf)$', filename, re.IGNORECASE):
        return None
    return filename


def prepare_attachment(url, default):
    '''
    Save a local copy of attachment, and return the local file path.

    The file is written to the system temp directory, named after the url's
    filename when one can be extracted, otherwise after default.
    NOTE(review): the HTTP status is not checked, so an error page would be
    saved as if it were the attachment.
    '''
    response = requests.get(url)
    filename = extract_filename_from_url(url) or default
    local_path = os.path.join(tempfile.gettempdir(), filename)
    with open(local_path, 'wb') as attachment:
        attachment.write(response.content)
    return local_path


def prepare_tags(tags):
    '''
    Takes a list of tags and formats in the object format expected by Zotero.
    '''
    return [{'tag': tag} for tag in tags]


def get_newspaper_pdf(article_id):
    '''
    Use my proxy app to get the url to the PDF copy of an article.

    Returns the body text of the proxy's response.
    '''
    response = requests.get('https://trove-proxy.herokuapp.com/pdf/{}'.format(article_id))
    return response.text


def _build_creators(template, contributors):
    '''
    Build one Zotero creator entry per contributor name.

    The first creator entry of the template (when present) is used as the
    pattern so that fields such as creatorType carry over to every entry.
    '''
    existing = template.get('creators') or []
    if existing:
        pattern = existing[0]
    else:
        pattern = {'creatorType': 'author', 'firstName': '', 'lastName': ''}
    creators = []
    for contributor in contributors:
        names = process_name(contributor)
        creator = dict(pattern)
        creator['firstName'] = names['other_names']
        creator['lastName'] = names['family_name']
        creators.append(creator)
    return creators


def create_zotero_object(zotero_api, trove_api, record):
    '''
    Process Trove record to populate fields in Zotero item template.

    record is a single-key dict whose key is the Trove item type ('work',
    'people', 'article' or 'externalWebsite') and whose value is the item.

    Returns a dict with 'zotero_item' (the populated template) and
    'attachments' (a list of local file paths to upload with the item).
    Raises ValueError for any other item type.
    '''
    attachments = []
    item_type = list(record.keys())[0]
    item = record[item_type]

    if item_type == 'work':
        zotero_type = guess_zotero_type(item['type'])
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        work = trove_api.get_item(item_id=item['id'], item_type='work')
        details = work.get_details()

        # Copy across the simple one-to-one fields for this Zotero type.
        for t_field, z_field in FIELD_MAPPINGS[zotero_type].items():
            if t_field in details:
                template[z_field] = '; '.join(details[t_field])

        contributors = details.get('contributor')
        if contributors:
            # Build a fresh creators list rather than indexing into the
            # template's list, which cannot be relied on to have an entry
            # for every contributor.
            template['creators'] = _build_creators(template, contributors)

        tags = work.get_all_tags()
        if 'subject' in details:
            tags.extend(details['subject'])
        if tags:
            template['tags'] = prepare_tags(tags)

        # Prefer the contributor's title (looked up via its NUC) as the
        # archive name, falling back to the raw repository value.
        source = work.get_repository()
        repository = None
        if source['nuc']:
            contributor = trove_api.get_item(item_id=source['nuc'], item_type='contributor')
            repository = contributor.get_title()
        elif source['repository']:
            repository = source['repository']
        if repository:
            template['archive'] = repository

        urls = work.get_urls()
        if 'mediumresolution' in urls:
            image_url = urls['mediumresolution']
        elif 'thumbnail' in urls:
            image_url = urls['thumbnail']
        else:
            image_url = None
        if image_url:
            attachments.append(prepare_attachment(image_url, 'image.jpg'))

        pdf_url = work.get_pdf_url()
        if pdf_url:
            attachments.append(prepare_attachment(pdf_url, 'article.pdf'))

    elif item_type == 'people':
        zotero_type = 'encyclopediaArticle'
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        template['title'] = 'Trove party record'
        template['url'] = item['troveUrl']

    elif item_type == 'article':
        zotero_type = 'newspaperArticle'
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        template['title'] = item['heading']
        template['url'] = 'http://nla.gov.au/nla.news-article' + item['id']
        template['publicationTitle'] = item['title']['value']
        template['pages'] = item['page']
        template['date'] = item['date']
        pdf_url = get_newspaper_pdf(item['id'])
        attachments.append(prepare_attachment(pdf_url, 'article-{}.pdf'.format(item['id'])))

    elif item_type == 'externalWebsite':
        zotero_type = 'webpage'
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        template['title'] = item['title']
        template['url'] = item['identifier']['value']

    else:
        # Without this, the code below would fail on an unbound 'template'.
        raise ValueError('Unsupported Trove item type: {}'.format(item_type))

    # FIELD_MAPPINGS gives 'webpage' no libraryCatalog field, so skip it there.
    if template['itemType'] != 'webpage':
        template['libraryCatalog'] = 'Trove'
    template['accessDate'] = datetime.datetime.now().date().isoformat()
    return {'zotero_item': template, 'attachments': attachments}


def create_zotero_collection(zotero_api, collection_name):
    '''
    Creates a Zotero collection with the given name, then retrieves the key
    for that collection.

    Returns the collection key, or None if no name was given or the
    collection could not be created or found (an error is printed for the
    latter two cases).
    '''
    if not collection_name:
        return None
    created = zotero_api.create_collections([{'name': collection_name}])
    if not created:
        print('Error creating collection.')
        return None
    # Look the new collection up by name to get its key.
    collections = zotero_api.collections(q=collection_name)
    try:
        return collections[0]['key']
    except (IndexError, KeyError):
        print('Error retrieving collection key.')
        return None


def check_duplicate_collection(zotero_api, collection_name):
    '''
    Check to see if a collection with the supplied name already exists.

    Returns the key of the first collection returned by the search, or None
    if the search returns nothing.
    '''
    collections = zotero_api.collections(q=collection_name)
    if collections:
        return collections[0]['key']
    return None


def export_list(list_id, zotero_api, trove_api):
    '''
    Export every non-deleted item of a Trove list into a Zotero collection.

    The collection is named '<list title> (Trove list: <list_id>)'; an
    existing collection matching that name is reused, otherwise one is
    created. Each item is created in Zotero, its attachments (if any) are
    uploaded, and progress is printed. Items Zotero does not report as
    successfully created are reported and skipped.
    '''
    trove_list = trove_api.get_item(item_id=list_id, item_type='list')
    list_name = '{} (Trove list: {})'.format(trove_list.get_title(), list_id)
    print(list_name)

    collection_key = check_duplicate_collection(zotero_api, list_name)
    if not collection_key:
        collection_key = create_zotero_collection(zotero_api, list_name)
    if not collection_key:
        return

    for item in trove_list.list_items:
        if 'deleted' in item:
            continue
        details = create_zotero_object(zotero_api, trove_api, item)
        zotero_item = details['zotero_item']
        zotero_item['collections'] = [collection_key]
        response = zotero_api.create_items([zotero_item])
        # print(response)
        try:
            # Only one item is submitted per request, so it is at index '0'.
            item_key = response['successful']['0']['key']
        except (KeyError, TypeError):
            print('Error adding item: {}'.format(zotero_item['title']))
            continue
        if details['attachments']:
            zotero_api.attachment_simple(details['attachments'], item_key)
        print('New item added: {}'.format(zotero_item['title']))


# ## Add your details here

# In[3]:


# NOTE: the ids and keys below are temporary demo values. Replace them with
# your own, and avoid committing real credentials to version control.

# This is a temporary group id created for demo purposes, replace it with your own personal or group library id
zotero_library_id = '2315662'

# Type should be either 'user' or 'group'
zotero_library_type = 'group'

# This is a temporary key created for demonstration purposes
zotero_library_key = 'zUjtvoJwuUNTwxKGIbj6t8wt'

zot_api = zotero.Zotero(zotero_library_id, zotero_library_type, zotero_library_key)


# In[4]:


# Replace this with the id of the list you want to export
trove_list_id = '83777'

# This is a temporary key created for demonstration purposes, replace it with your own
trove_api_key = 'ju3rgk0jp354ikmh'

trove_api = Trove(trove_api_key)


# ## Start the export

# In[ ]:


export_list(trove_list_id, zot_api, trove_api)


# In[ ]: