This notebook extracts a dataset from a digital collection described in MARCXML files. The collection contains the descriptive metadata of the Moving Image Archive catalogue, which is Scotland’s national collection of moving images.
# import the libraries we need
# https://pypi.org/project/pymarc/
import pymarc, re, csv
import pandas as pd
from pymarc import parse_xml_to_array
from datapackage import Package
# Convert the MARCXML export into marc_records.csv, one row per MARC record.

# Column order of marc_records.csv; the header row and every data row follow it.
CSV_COLUMNS = ['title', 'author', 'place_production', 'date', 'extents',
               'credits_note', 'subjects', 'summary', 'detail', 'link']


def _first_value(record, tag, code):
    """Return subfield `code` of the first `tag` field, or '' when absent."""
    fields = record.get_fields(tag)
    if not fields:
        return ''
    values = fields[0].get_subfields(code)
    return values[0] if values else ''


def _last_value(record, tag, code):
    """Return subfield `code` of the last `tag` field that carries it, or ''."""
    value = ''
    for field in record.get_fields(tag):
        values = field.get_subfields(code)
        if values:
            value = values[0]
    return value


def _record_to_row(record):
    """Flatten one MARC record into a list of strings ordered like CSV_COLUMNS.

    Every value is computed from scratch for each record, so nothing can
    leak over from the previously processed record, and a missing field or
    subfield yields an empty string instead of raising.
    """
    # title: 245 $a, followed by $b when present
    title = ' '.join(
        part
        for part in (_first_value(record, '245', 'a'),
                     _first_value(record, '245', 'b'))
        if part
    )

    # determine author: the first of these tags present in the record wins
    author = ''
    for tag in ('100', '110', '700', '710'):
        if record.get_fields(tag):
            author = _first_value(record, tag, 'a')
            break

    # place_production
    place_production = _first_value(record, '264', 'a')

    # date, without the trailing '.'
    date = _last_value(record, '264', 'c')
    if date.endswith('.'):
        date = date[:-1]

    # Physical Description - extent ($a) and detail ($b)
    # TODO cleaning
    extent = _last_value(record, '300', 'a')
    detail = _last_value(record, '300', 'b')

    # Creation/production credits note
    credits_note = _first_value(record, '508', 'a')

    # Summary
    summary = _first_value(record, '520', 'a')

    # subject: one term per 653 field, joined with ' -- ';
    # 653 fields without an $a are skipped
    subject_terms = []
    for field in record.get_fields('653'):
        values = field.get_subfields('a')
        if values:
            subject_terms.append(values[0])
    subjects = ' -- '.join(subject_terms)

    # link
    link = _first_value(record, '856', 'u')

    return [title, author, place_production, date, extent,
            credits_note, subjects, summary, detail, link]


# Binary mode lets the XML parser apply the encoding declared in the file
# itself; the handle is closed as soon as parsing is done.
with open('Moving-Image-Archive/Moving-Image-Archive-dataset-MARC.xml', 'rb') as xml_file:
    records = parse_xml_to_array(xml_file)

# newline='' is what the csv module expects for files it writes. Leaving the
# with-block closes (and therefore flushes) the file, so anything that reads
# marc_records.csv afterwards sees every row.
with open('marc_records.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_out = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_out.writerow(CSV_COLUMNS)
    for record in records:
        csv_out.writerow(_record_to_row(record))
Data Package is a simple container format for describing a coherent collection of data in a single 'package'. It provides the basis for convenient delivery, installation and management of datasets. There is a Python library for working with Data Packages.
# Start from an empty Data Package and let the library infer its contents
# from the CSV file produced earlier in this notebook.
package = Package()
package.infer('marc_records.csv')
# Last expression of the cell: shows the package descriptor that resulted.
package.descriptor
The Data Package contains the data and the descriptor as a zip file.
# Write the package to datapackage.zip in the current working directory.
package.save('datapackage.zip')
We can also read the CSV file to explore the metadata
# This puts the data in a Pandas DataFrame
df = pd.read_csv('marc_records.csv')
# Let's have a look inside...
# Note that both the columns and rows are truncated in this preview
df
# Column names, taken from the header row of the CSV
df.columns
# Number of rows in the DataFrame
len(df)
# The 'subjects' value of the row labelled 2
df['subjects'][2]
# Explore the individual subject terms stored in the 'subjects' column.
# The terms of one record are joined with ' -- ' (spaces included), so every
# split below uses that exact delimiter; splitting on a bare '--' would leave
# stray spaces on the terms and count the same subject under several keys.

# One subject term per row, indexed by (record, position)
df['subjects'].str.split(' -- ', expand=True).stack()

# Long-form view reused below: one entry per subject term, with records that
# have no subjects dropped and surrounding whitespace removed
subject_terms = df['subjects'].dropna().str.split(' -- ').explode().str.strip()

# Get unique values
topics = subject_terms.unique().tolist()
for topic in sorted(topics, key=str.lower):
    print(topic)

# Counts how often each subject term occurs, most frequent first
topic_counts = (
    subject_terms
    .value_counts()
    .rename_axis('subject')
    .reset_index(name='count')
)

# Display with horizontal bars
display(topic_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))