#!/usr/bin/env python
# coding: utf-8

# # Exploring object records
# 
# In this notebook we'll have a preliminary poke around in the `object` data harvested from the [NMA Collection API](https://www.nma.gov.au/about/our-collection/our-apis). I'll focus here on the basic shape/stats of the data; other notebooks will explore the object data over [time](explore_collection_object_over_time.ipynb) and [space](explore_objects_and_places.ipynb).
# 
# If you haven't already, you'll either need to [harvest the `object` data](harvest_records.ipynb), or [unzip a pre-harvested dataset](unzip_preharvested_data.ipynb).
# 
# * [The shape of the data](#The-shape-of-the-data)
# * [Nested data](#Nested-data)
# * [The `additionalType` field](#The-additionalType-field)
# * [The `extent` field](#The-extent-field)
# * [How big is the collection?](#How-big-is-the-collection?)
# * [The biggest object?](#The-biggest-object?)

# If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!
# 
# Some tips:
# 
# * Code cells have boxes around them.
# * To run a code cell, click on the cell and then hit **Shift+Enter**. The **Shift+Enter** combo will also move you to the next cell, so it's a quick way to work through the notebook.
# * While a cell is running a `*` appears in the square brackets next to the cell. Once the cell has finished running, the asterisk will be replaced with a number.
# * In most cases you'll want to start from the top of the notebook and work your way down, running each cell in turn. Later cells might depend on the results of earlier ones.
# * To edit a code cell, just click on it and type. Remember to run the cell once you've finished editing.

# Is this thing on? If you can't edit or run any of the code cells, you might be viewing a static (read only) version of this notebook. Click here to load a live version running on Binder.
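
# Not sure if the notebook is live? Try running the cell below -- it's a trivial test cell (nothing later depends on it), and if everything is working you'll see a message printed underneath.

# In[ ]:


# A simple test -- run me first!
print('The notebook is alive!')
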
# ## Import what we need

# In[23]:


import pandas as pd
import math
from IPython.display import display, HTML, FileLink
from tinydb import TinyDB, Query
from pandas import json_normalize


# ## Load the harvested data

# In[2]:


# Load the harvested data from the json db
db = TinyDB('nma_object_db.json')
records = db.all()
Object = Query()


# In[3]:


# Convert to a dataframe
df = pd.DataFrame(records)
df.head()


# ## The shape of the data
# 
# How many objects are there?

# In[4]:


print('There are {:,} objects in the collection'.format(df.shape[0]))


# Obviously not every record has a value for every field, so let's create a quick count of the number of values in each field.

# In[5]:


df.count()


# Let's express those counts as a percentage of the total number of records, and display them as a bar chart using Pandas.

# In[6]:


# Get field counts and convert to dataframe
field_counts = df.count().to_frame().reset_index()
# Change column headings
field_counts.columns = ['field', 'count']
# Calculate proportion of the total
field_counts['proportion'] = field_counts['count'].apply(lambda x: x / df.shape[0])
# Style the results as a barchart
field_counts.style.bar(subset=['proportion'], color='#d65f5f').format({'proportion': '{:.2%}'.format})


# ## Nested data
# 
# One thing you might notice is that some of the fields contain nested JSON arrays or objects. For example, `additionalType` contains a list of object types, while `extent` is a dictionary of keys and values. Let's unpack these columns for the second row (index of 1).

# In[7]:


df['additionalType'][1][0]


# In[8]:


df['extent'][1]


# In[9]:


df['extent'][1]['length']


# ## The `additionalType` field
# 
# How many objects have values in the `additionalType` column?

# In[10]:


df.loc[df['additionalType'].notnull()].shape


# In[11]:


print('{:.2%} of objects have an additionalType value'.format(df.loc[df['additionalType'].notnull()].shape[0] / df.shape[0]))


# So which ones don't have an `additionalType`?

# In[12]:


# Just show the first 5 rows
df.loc[df['additionalType'].isnull()].head()


# How many rows have more than one `additionalType`?

# In[13]:


df.loc[df['additionalType'].str.len() > 1].shape[0]


# Let's have a look at a sample.

# In[14]:


df.loc[df['additionalType'].str.len() > 1].head()


# The `additionalType` field contains a nested list of values. Using `json_normalize()` or `explode()` we can unpack these lists, creating a separate row for each value.

# In[15]:


# Use json_normalize to expand 'additionalType' into separate rows, adding the id and title from the parent record
# df_types = json_normalize(df.loc[df['additionalType'].notnull()].to_dict('records'), record_path='additionalType', meta=['id', 'title'], errors='ignore').rename({0: 'additionalType'}, axis=1)

# In pandas v.0.25 and above you can just use explode -- this produces the same result as above
df_types = df.loc[df['additionalType'].notnull()][['id', 'title', 'additionalType']].explode('additionalType')
df_types.head()


# Now that we've exploded the type values, we can aggregate them in different ways. Let's look at the 25 most common object types!

# In[16]:


df_types['additionalType'].value_counts()[:25]


# How many object types only appear once?

# In[17]:


type_counts = df_types['additionalType'].value_counts().to_frame().reset_index().rename({'index': 'type', 'additionalType': 'count'}, axis=1)
unique_types = type_counts.loc[type_counts['count'] == 1]
unique_types.shape[0]


# In[18]:


unique_types.head()
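
# If you're wondering which objects those one-off types belong to, here's a minimal sketch that merges `unique_types` back onto the exploded `df_types` dataframe (both created above; `one_offs` is just an arbitrary name for the result).

# In[ ]:


# Attach object ids and titles to the types that only appear once
one_offs = df_types.merge(unique_types, left_on='additionalType', right_on='type')
one_offs[['id', 'title', 'additionalType']].head()
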
# Let's save the complete list of types as a CSV file.

# In[19]:


type_counts.to_csv('nma_object_type_counts.csv', index=False)
display(FileLink('nma_object_type_counts.csv'))


# Browsing the CSV I noticed that there was one item with the type `Vegetables`. Let's find out some more about it.

# In[20]:


# Find in the complete data set
mask = df.loc[df['additionalType'].notnull()]['additionalType'].apply(lambda x: 'Vegetables' in x)
veggie = df.loc[df['additionalType'].notnull()][mask]
veggie


# We can create a link into the NMA Collection Explorer using the object `id`.

# In[21]:


# Build a Collection Explorer link from the object id
display(HTML('<a href="http://collectionsearch.nma.gov.au/object/{}">{}</a>'.format(veggie.iloc[0]['id'], veggie.iloc[0]['title'])))


# Does a toadstool count as a vegetable?

# ## The `extent` field
# 
# The `extent` field is a nested object, so once again we'll use `json_normalize()` to expand it out into separate columns.

# In[24]:


# Without reset_index() the rows are misaligned
df_extent = df.loc[df['extent'].notnull()].reset_index().join(json_normalize(df.loc[df['extent'].notnull()]['extent'].tolist()).add_prefix("extent_"))
df_extent.head()


# Let's check to see what types of things are in the `extent` field.

# In[25]:


df_extent['extent_type'].value_counts()


# So they're all measurements. Let's have a look at the units being used.

# In[26]:


df_extent['extent_unitText'].value_counts()


# In[27]:


df_extent['extent_unitTextWeight'].value_counts()


# Hmmm, are those measurements really in metres, or are they actually meant to be 'mm'? Let's have a look at them.

# In[28]:


df_extent.loc[df_extent['extent_unitText'] == 'm'][['id', 'title', 'extent_length', 'extent_width', 'extent_unitText']]


# Other than 'Gunter's chain', it looks like the unit should indeed be 'mm'. We'll need to take that into account in calculations.
# 
# Now let's convert all the measurements into a single unit – millimetres for lengths, and grams for weights.

# In[29]:


def conversion_factor(unit):
    '''
    Get the factor required to convert the current unit to either mm or g.
    '''
    factors = {
        'mm': 1,
        'cm': 10,
        'm': 1,  # Most should in fact be mm (see above)
        'g': 1,
        'kg': 1000,
        'tonne': 1000000,
        'oz': 28.35,
        'lb': 453.592
    }
    try:
        factor = factors[unit.lower()]
    except KeyError:
        # Unknown units zero out the measurement
        factor = 0
    return factor


def normalise_measurements(row):
    '''
    Convert a row's measurements to standard units (mm and g).
    '''
    l_factor = conversion_factor(str(row['extent_unitText']))
    length = row['extent_length'] * l_factor
    width = row['extent_width'] * l_factor
    depth = row['extent_depth'] * l_factor
    height = row['extent_height'] * l_factor
    diameter = row['extent_diameter'] * l_factor
    w_factor = conversion_factor(str(row['extent_unitTextWeight']))
    weight = row['extent_weight'] * w_factor
    return pd.Series([length, width, depth, height, diameter, weight])


# Add normalised measurements to the dataframe
df_extent[['length_mm', 'width_mm', 'depth_mm', 'height_mm', 'diameter_mm', 'weight_g']] = df_extent.apply(normalise_measurements, axis=1)


# In[30]:


df_extent.head()
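
# Before totalling everything up, it's worth a quick sanity check on the conversions. The cell below is a minimal check (using only the columns created above): it shows the first couple of rows for each unit, with raw and normalised lengths side by side. The 'mm' and 'm' values should pass through unchanged, while any 'cm' values should be multiplied by 10.

# In[ ]:


# Show a couple of rows per unit to eyeball raw vs normalised lengths
df_extent[['extent_unitText', 'extent_length', 'length_mm']].groupby('extent_unitText').head(2)
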
# ## How big is the collection?

# In[31]:


def calculate_volume(row):
    '''
    Look for 3 linear dimensions and multiply them to get a volume.
    '''
    # Create a list of valid linear measurements from the available fields
    dimensions = [d for d in [row['length_mm'], row['width_mm'], row['depth_mm'], row['height_mm'], row['diameter_mm']] if not math.isnan(d)]
    # If there are only 2 dimensions...
    if len(dimensions) == 2:
        # Set a default height of 1 for items with only 2 dimensions
        dimensions.append(1)
    # If there are 3 or more dimensions, multiply the first 3 together
    if len(dimensions) >= 3:
        volume = dimensions[0] * dimensions[1] * dimensions[2]
    else:
        volume = 0
    return volume


df_extent['volume'] = df_extent.apply(calculate_volume, axis=1)


# In[32]:


print('Total length of objects is {:.2f} km'.format(df_extent['length_mm'].sum() / 1000 / 1000))


# In[33]:


print('Total weight of objects is {:.2f} tonnes'.format(df_extent['weight_g'].sum() / 1000000))


# In[34]:


print('Total volume of objects is {:.2f} m\N{SUPERSCRIPT THREE}'.format(df_extent['volume'].sum() / 1000000000))


# ## The biggest object?
# 
# What's the biggest thing?

# In[35]:


# Get the object with the largest volume
biggest = df_extent.loc[df_extent['volume'].idxmax()]
# Create a link to Collection Explorer
display(HTML('<a href="http://collectionsearch.nma.gov.au/object/{}">{}</a>'.format(biggest['id'], biggest['title'])))


# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).
# 
# Work on this notebook was supported by the [Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab](https://tinker.edu.au/).