#!/usr/bin/env python # coding: utf-8 # # Summarise series data # # In this notebook we'll summarise data from all the harvested series. # In[1]: import os import pandas as pd from IPython.core.display import display, HTML import series_details # In[152]: # This is a list of all the series harvested as part of this repository series_list = ['B13', 'B6003', 'BP343/15', 'D2860', 'D5036', 'D596', 'E752', 'J2481', 'J2482', 'J2483', 'J3115', 'K1145', 'P437', 'P526', 'PP4/2', 'PP6/1', 'SP11/26', 'SP11/6', 'SP115/1', 'SP115/10', 'SP42/1', 'SP726/1', 'ST84/1'] # ## Aggregate all the series data into a single dataframe # # Let's combine summaries of all the harvested series into a single dataframe so we can look at the big picture. # In[ ]: # Create a list to store the summaries summaries = [] # Loop through the list of series in this repo for series in series_list: # Open the CSV of each series harvest as a data frame df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date']) # Extract a summary of each series and add it to the list of summaries summaries.append(series_details.make_summary(series, df, include_titles=False)) # Convert the list of summaries into a DataFrame for easy manipulation df = pd.DataFrame(summaries) # Flatten the access count dictionaries and fill blanks with zero df = pd.concat([df, pd.DataFrame((d for idx, d in df['access_counts'].iteritems()))], axis=1).fillna(0) # Change access counts from floats to integers df[['Closed', 'Not yet examined', 'Open with exception', 'Open']] = df[['Closed', 'Not yet examined', 'Open with exception', 'Open']].astype(int) # Delete the old 'access_counts' column del df['access_counts'] # For convenience acronymise 'Not yet examined' and 'Open with exception' df.rename({'Not yet examined': 'NYE', 'Open with exception': 'OWE'}, axis=1, inplace=True) # ## Display a summary table of all series # # Let's display a summary of each series in a nicely formatted table. # In[153]: # Get the columns into the order we want df = df[['series', 'total_items', 'date_from', 'date_to', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages']] # Calculate and add a percentage open column df['% open'] = df['Open'] / df['total_items'] # Calculate and add a percentage digitised column df['% digitised'] = df['digitised_files'] / df['total_items'] # Add a link to the series name df['series'] = df['series'].apply(lambda x: '{}'.format(x.replace('/', '-'), x)) # Style the output (df.style .set_properties(**{'font-size': '120%'}) .set_properties(subset=['series'], **{'text-align': 'left', 'font-weight': 'bold'}) .format('{:,}', ['total_items', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages']) .format('{:.2%}', ['% open', '% digitised']) # Hide the index .set_table_styles([dict(selector="th", props=[("font-size", "120%"), ("text-align", "center")]), dict(selector='.row_heading, .blank', props=[('display', 'none')])]) .background_gradient(cmap='Greens', subset=['% open', '% digitised'], high=0.5) ) # ## Calculate some aggregate data # # Now let's calculate some aggregate data across the complete set of series harvests. # In[155]: total_items = df['total_items'].sum() print('Total items: {:,}'.format(total_items)) # In[156]: date_from = df['date_from'].min() print('Earliest date: {}'.format(date_from)) # In[157]: date_to = df['date_to'].max() print('Latest date: {}'.format(date_to)) # In[161]: access_status = {} total = df['total_items'].sum() print('Access status:\n') for status in ['Open', 'OWE', 'NYE', 'Closed']: status_total = df[status].sum() access_status[status] = status_total print(' {: <10} {: <10,} ({:.2%})'.format(status, status_total, status_total/total)) # In[162]: digitised_files = df['digitised_files'].sum() print('Total files digitised: {:,}'.format(digitised_files)) # In[163]: digitised_pages = df['digitised_pages'].sum() print('Total pages digitised: {:,}'.format(digitised_pages)) # In[ ]: