Notebook

Export a Trove list to Zotero¶

In [1]:

import datetime
from urllib.parse import urlparse, urlsplit
import requests
import os.path
import tempfile
import copy
import re
from pyzotero import zotero
from trove import Trove

# Lookup table from a Trove work type string (key) to the Zotero item type
# name (value) used to request an item template.
# guess_zotero_type() consults this table and falls back to 'journalArticle'
# for any type that has no entry here.
TROVE_ZOTERO_MAPPINGS = {
    'Art work': 'artwork',
    'Article': 'journalArticle',
    'Article/Book chapter': 'bookSection',
    'Article/Conference paper': 'conferencePaper',
    'Article/Journal or magazine article': 'journalArticle',
    'Article/Other article': 'journalArticle',
    'Article/Report': 'report',
    'Article/Review': 'journalArticle',
    'Article/Working paper': 'report',
    'Audio book': 'book',
    'Book': 'book',
    'Book/Braille': 'book',
    'Book/Illustrated': 'book',
    'Book/Large print': 'book',
    'Conference Proceedings': 'book',
    'Data set': 'computerProgram',
    'Map': 'map',
    'Map/Aerial photograph': 'map',
    'Map/Atlas': 'map',
    'Map/Braille': 'map',
    'Map/Electronic': 'map',
    'Map/Globe or object': 'map',
    'Map/Large print': 'map',
    'Map/Map series': 'map',
    'Map/Microform': 'map',
    'Map/Single map': 'map',
    'Object': 'artwork',
    'Periodical': 'book',
    'Periodical/Journal, magazine, other': 'book',
    'Periodical/Newspaper': 'book',
    'Photograph': 'artwork',
    'Poster, chart, other': 'artwork',
    'Published': 'document',
    'Sheet music': 'document',
    'Sound': 'audioRecording',
    'Sound/Interview, lecture, talk': 'audioRecording',
    'Sound/Other sound': 'audioRecording',
    'Sound/Recorded music': 'audioRecording',
    'Thesis': 'thesis',
    'Unpublished': 'manuscript',
    'Video': 'videoRecording',
    'Video/Captioned': 'videoRecording'
}

# For each Zotero item type: Trove record field name (key) -> Zotero template
# field (value) that it populates. create_zotero_object() looks up the entry
# for the guessed Zotero type and joins each Trove field's values with '; '.
FIELD_MAPPINGS = {
    # Fields that can hold multiple values are not listed here; they are
    # handled separately when the Zotero item is built:
    #   contributor -> creators
    #   tags -> tags
    #   subject -> tags
    'journalArticle': { 
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'publicationTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'book': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'edition': 'edition',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'bookSection': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'bookTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'publisher': 'publisher',
        'edition': 'edition',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'conferencePaper': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'proceedingsTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'pagination': 'pages',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'report': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'institution',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'thesis': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'university',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'artwork': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'map': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'computerProgram': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'abstract': 'abstractNote',
        'publisher': 'company',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'document': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'manuscript': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'language': 'language',
        'abstract': 'abstractNote',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'audioRecording': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'seriesTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'label',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'videoRecording': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'seriesTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'studio',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'encyclopediaArticle': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'encyclopediaTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'publisher': 'publisher',
        'repository': 'archive',
        'rights': 'rights',
        'source': 'libraryCatalog'
    },
    'webpage': {
        'title': 'title',
        'troveUrl': 'url',
        'issued': 'date',
        'isPartOf': 'websiteTitle',
        'language': 'language',
        'abstract': 'abstractNote',
        'rights': 'rights'
    }
}

In [2]:

def guess_zotero_type(item_type):
    '''
    Check mappings to try and find a zotero type.

    item_type is either a single Trove type string or a list of them; when
    a list is supplied only its first entry is looked up. Any type without
    an entry in TROVE_ZOTERO_MAPPINGS (including an empty list) falls back
    to 'journalArticle'.
    '''
    if isinstance(item_type, list):
        # An empty list has no first entry; fall through to the default
        # instead of raising IndexError.
        item_type = item_type[0] if item_type else None
    return TROVE_ZOTERO_MAPPINGS.get(item_type, 'journalArticle')


def process_name(name):
    '''
    Split a contributor name into family name and other names.

    The name is split on commas. The first part is the family name and the
    second part (if any) the other names; any further parts, such as dates
    appended to the name, are discarded. Surrounding whitespace is removed
    from both parts so 'Smith, John' gives 'John' rather than ' John'.

    Returns a dict with the keys 'family_name' and 'other_names'
    ('other_names' is an empty string when the name contains no comma).
    '''
    parts = [part.strip() for part in name.split(',')]
    family_name = parts[0]
    other_names = parts[1] if len(parts) > 1 else ''
    return {'family_name': family_name, 'other_names': other_names}


def extract_filename_from_url(url):
    '''
    Try and get the filename of attachments.

    Returns the last path segment of the url when it ends in one of the
    recognised attachment extensions (jpg, gif, png, tif, pdf; matched
    case-insensitively), otherwise None. The query string and fragment are
    ignored.
    '''
    filename = os.path.basename(urlsplit(url).path)
    # Anchored at the end so only a real file extension counts, and with an
    # alternation for every extension (tif and pdf were previously fused
    # into a single '.tif.pdf' alternative that never matched).
    if not re.search(r'\.(?:jpg|gif|png|tif|pdf)$', filename, re.IGNORECASE):
        return None
    return filename


def prepare_attachment(url, default, timeout=60):
    '''
    Save a local copy of attachment, and return the local file path.

    url -- location of the file to download.
    default -- filename to use when one can't be extracted from the url.
    timeout -- seconds to wait for the server before giving up.

    The file is written to the system temporary directory. Raises a
    requests exception if the download times out or the server answers
    with an HTTP error status, so an error page is never saved (and later
    uploaded) as if it were the attachment.
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    filename = extract_filename_from_url(url) or default
    local_path = os.path.join(tempfile.gettempdir(), filename)
    with open(local_path, 'wb') as attachment:
        attachment.write(response.content)
    return local_path


def prepare_tags(tags):
    '''
    Wrap each tag in the {'tag': value} object structure that Zotero uses,
    keeping the tags in their original order.
    '''
    formatted = []
    for label in tags:
        formatted.append({'tag': label})
    return formatted


def get_newspaper_pdf(article_id, timeout=60):
    '''
    Use my proxy app to get the url to the PDF copy of an article.

    article_id -- id of the newspaper article.
    timeout -- seconds to wait for the proxy before giving up.

    Returns the body of the proxy's response as text. Raises a requests
    exception if the request times out or the proxy answers with an HTTP
    error status, rather than handing an error page back as if it were
    the PDF url.
    '''
    response = requests.get(
        'https://trove-proxy.herokuapp.com/pdf/{}'.format(article_id),
        timeout=timeout
    )
    response.raise_for_status()
    return response.text


def create_zotero_object(zotero_api, trove_api, record):
    '''
    Process Trove record to populate fields in Zotero item template.

    zotero_api -- object providing item_template(item_type).
    trove_api -- object providing get_item(item_id=..., item_type=...).
    record -- a list item: a dict whose first key names the record type
        ('work', 'people', 'article' or 'externalWebsite') and whose value
        holds that record's data.

    Returns {'zotero_item': <populated template>, 'attachments': <list of
    local file paths>}. A failed attachment download is reported and
    skipped so the item itself can still be exported.

    Raises ValueError for a record type other than the four listed above.
    '''
    attachments = []

    def add_attachment(url, default):
        # Attachments are optional extras: report a failed download and
        # carry on rather than losing the whole item.
        try:
            attachments.append(prepare_attachment(url, default))
        except (requests.RequestException, OSError) as error:
            print('Error downloading attachment {}: {}'.format(url, error))

    item_type = list(record.keys())[0]
    item = record[item_type]
    if item_type == 'work':
        zotero_type = guess_zotero_type(item['type'])
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        work = trove_api.get_item(item_id=item['id'], item_type='work')
        details = work.get_details()
        for t_field, z_field in FIELD_MAPPINGS[zotero_type].items():
            if t_field in details:
                value = details[t_field]
                # Values are joined as a list of strings; a bare string is
                # used as-is so it isn't split into single characters.
                if isinstance(value, str):
                    template[z_field] = value
                else:
                    template[z_field] = '; '.join(value)
        if 'contributor' in details:
            # Clone the template's first creator entry for every
            # contributor (keeping its creatorType) instead of indexing
            # into the template's list, which raised IndexError as soon as
            # there were more contributors than template entries.
            blank_creator = template['creators'][0]
            creators = []
            for contributor in details['contributor']:
                names = process_name(contributor)
                creator = dict(blank_creator)
                creator['firstName'] = names['other_names']
                creator['lastName'] = names['family_name']
                creators.append(creator)
            if creators:
                template['creators'] = creators
        # Copy so that adding subjects doesn't modify the list owned by the
        # work object.
        tags = list(work.get_all_tags() or [])
        if 'subject' in details:
            tags.extend(details['subject'])
        if tags:
            template['tags'] = prepare_tags(tags)
        source = work.get_repository()
        repository = None
        if source['nuc']:
            # Resolve the NUC code to the contributing organisation's title.
            contributor = trove_api.get_item(item_id=source['nuc'], item_type='contributor')
            repository = contributor.get_title()
        elif source['repository']:
            repository = source['repository']
        if repository:
            template['archive'] = repository
        urls = work.get_urls()
        # Prefer the medium resolution image over the thumbnail.
        if 'mediumresolution' in urls:
            image_url = urls['mediumresolution']
        elif 'thumbnail' in urls:
            image_url = urls['thumbnail']
        else:
            image_url = None
        if image_url:
            add_attachment(image_url, 'image.jpg')
        pdf_url = work.get_pdf_url()
        if pdf_url:
            add_attachment(pdf_url, 'article.pdf')

    elif item_type == 'people':
        zotero_type = 'encyclopediaArticle'
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        template['title'] = 'Trove party record'
        template['url'] = item['troveUrl']

    elif item_type == 'article':
        zotero_type = 'newspaperArticle'
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        template['title'] = item['heading']
        template['url'] = 'http://nla.gov.au/nla.news-article' + item['id']
        template['publicationTitle'] = item['title']['value']
        template['pages'] = item['page']
        template['date'] = item['date']
        try:
            pdf_url = get_newspaper_pdf(item['id'])
        except requests.RequestException as error:
            print('Error retrieving PDF url for article {}: {}'.format(item['id'], error))
        else:
            add_attachment(pdf_url, 'article-{}.pdf'.format(item['id']))

    elif item_type == 'externalWebsite':
        zotero_type = 'webpage'
        template = copy.deepcopy(zotero_api.item_template(zotero_type))
        template['title'] = item['title']
        template['url'] = item['identifier']['value']

    else:
        # Previously an unknown type fell through and failed on the
        # unassigned template variable; fail with a clear message instead.
        raise ValueError('Unsupported Trove list item type: {}'.format(item_type))

    if template['itemType'] != 'webpage':
        template['libraryCatalog'] = 'Trove'
    template['accessDate'] = datetime.datetime.now().date().isoformat()
    return {'zotero_item': template, 'attachments': attachments}

def create_zotero_collection(zotero_api, collection_name):
    '''
    Creates a Zotero collection with the given name,
    then retrieves the key for that collection.

    Returns the collection key, or None if no name was given, the
    collection could not be created, or its key could not be determined
    (the last two cases also print an error message).
    '''
    collection_key = None
    if collection_name:
        created = zotero_api.create_collections([{'name': collection_name}])
        if created:
            # When the create call hands back a write response shaped like
            # the one export_list reads for create_items, take the key
            # straight from it: that is the collection just created.
            if isinstance(created, dict):
                try:
                    collection_key = created['successful']['0']['key']
                except (KeyError, TypeError):
                    collection_key = None
            if not collection_key:
                # Fall back to searching by name. The search can return
                # other collections as well, so prefer one whose name is
                # exactly the one requested before settling for the first.
                collections = zotero_api.collections(q=collection_name)
                exact = [
                    collection for collection in collections
                    if isinstance(collection, dict)
                    and isinstance(collection.get('data'), dict)
                    and collection['data'].get('name') == collection_name
                ]
                try:
                    collection_key = (exact or collections)[0]['key']
                except (IndexError, KeyError):
                    print('Error retrieving collection key.')
        else:
            print('Error creating collection.')
    return collection_key

def check_duplicate_collection(zotero_api, collection_name):
    '''
    Check to see if a collection with the supplied name already exists.

    Returns the key of the first collection whose name is exactly
    collection_name, or None if there isn't one. Search hits with a
    different name are ignored so that items are never added to a
    collection that merely has a similar name.
    '''
    collections = zotero_api.collections(q=collection_name)
    for collection in collections or []:
        # NOTE(review): assumes each result carries its name under
        # ['data']['name'] (Zotero API v3 JSON) -- confirm against the
        # pyzotero version in use.
        data = collection.get('data') if isinstance(collection, dict) else None
        if isinstance(data, dict) and data.get('name') == collection_name:
            return collection['key']
    return None

def export_list(list_id, zotero_api, trove_api):
    '''
    Copy the items in a Trove list into a Zotero collection.

    list_id -- id of the Trove list to export.
    zotero_api -- Zotero client used to create the collection, the items
        and their attachments.
    trove_api -- Trove client used to retrieve the list and its items.

    The collection is named '<list title> (Trove list: <list_id>)' and is
    reused if it already exists. Items flagged as deleted are skipped.
    Progress is printed as each item is added; nothing is returned.
    '''
    trove_list = trove_api.get_item(item_id=list_id, item_type='list')
    list_name = '{} (Trove list: {})'.format(trove_list.get_title(), list_id)
    print(list_name)
    collection_key = check_duplicate_collection(zotero_api, list_name)
    if not collection_key:
        collection_key = create_zotero_collection(zotero_api, list_name)
    if not collection_key:
        return
    for item in trove_list.list_items:
        if 'deleted' in item:
            continue
        details = create_zotero_object(zotero_api, trove_api, item)
        zotero_item = details['zotero_item']
        zotero_item['collections'] = [collection_key]
        attachments = details['attachments']
        try:
            response = zotero_api.create_items([zotero_item])
            try:
                item_key = response['successful']['0']['key']
            except (KeyError, TypeError):
                # The item was rejected, so there is no key to attach to;
                # report it and move on to the next item.
                print('Error creating item: {}'.format(zotero_item['title']))
                continue
            if attachments:
                zotero_api.attachment_simple(attachments, item_key)
            print('New item added: {}'.format(zotero_item['title']))
        finally:
            # The attachments are temporary downloads; remove them whether
            # or not the upload worked so they don't pile up on disk.
            for path in attachments:
                try:
                    os.remove(path)
                except OSError:
                    pass

Add your details here¶

In [3]:

# Zotero library to export into. Each value can be supplied through an
# environment variable so that real credentials never need to be saved in the
# notebook; the fallbacks are temporary values created for demo purposes and
# should be replaced with your own.
# Id of your personal or group library
zotero_library_id = os.environ.get('ZOTERO_LIBRARY_ID', '2315662')
# Type should be either 'user' or 'group'
zotero_library_type = os.environ.get('ZOTERO_LIBRARY_TYPE', 'group')
# API key with write access to the library
zotero_library_key = os.environ.get('ZOTERO_API_KEY', 'zUjtvoJwuUNTwxKGIbj6t8wt')

zot_api = zotero.Zotero(zotero_library_id, zotero_library_type, zotero_library_key)

In [4]:

# Replace this with the id of the list you want to export
trove_list_id = '83777'
# Trove API key. Set the TROVE_API_KEY environment variable to keep your own
# key out of the notebook; the fallback is a temporary key created for
# demonstration purposes and should be replaced with your own.
trove_api_key = os.environ.get('TROVE_API_KEY', 'ju3rgk0jp354ikmh')

trove_api = Trove(trove_api_key)

Start the export¶

In [ ]:

# Export the list configured above: its non-deleted items are added to a
# Zotero collection named after the list, and progress is printed per item.
export_list(trove_list_id, zot_api, trove_api)

In [ ]: