In this notebook we'll summarise data from all the harvested series.
import os
import pandas as pd
from IPython.core.display import display, HTML
import series_details
# This is a list of all the series harvested as part of this repository
series_list = ['B13', 'B6003', 'BP343/15', 'D2860', 'D5036', 'D596', 'E752', 'J2481', 'J2482', 'J2483', 'J3115', 'K1145', 'P437', 'P526', 'PP4/2', 'PP6/1', 'SP11/26', 'SP11/6', 'SP115/1', 'SP115/10', 'SP42/1', 'SP726/1', 'ST84/1']
Let's combine summaries of all the harvested series into a single dataframe so we can look at the big picture.
# Create a list to store the summaries
summaries = []
# Loop through the list of series in this repo
for series in series_list:
# Open the CSV of each series harvest as a data frame
df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])
# Extract a summary of each series and add it to the list of summaries
summaries.append(series_details.make_summary(series, df, include_titles=False))
# Convert the list of summaries into a DataFrame for easy manipulation
df = pd.DataFrame(summaries)
# Flatten the access count dictionaries and fill blanks with zero
df = pd.concat([df, pd.DataFrame((d for idx, d in df['access_counts'].iteritems()))], axis=1).fillna(0)
# Change access counts from floats to integers
df[['Closed', 'Not yet examined', 'Open with exception', 'Open']] = df[['Closed', 'Not yet examined', 'Open with exception', 'Open']].astype(int)
# Delete the old 'access_counts' column
del df['access_counts']
# For convenience acronymise 'Not yet examined' and 'Open with exception'
df.rename({'Not yet examined': 'NYE', 'Open with exception': 'OWE'}, axis=1, inplace=True)
Let's display a summary of each series in a nicely formatted table.
# Get the columns into the order we want
df = df[['series', 'total_items', 'date_from', 'date_to', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages']]
# Calculate and add a percentage open column
df['% open'] = df['Open'] / df['total_items']
# Calculate and add a percentage digitised column
df['% digitised'] = df['digitised_files'] / df['total_items']
# Add a link to the series name
df['series'] = df['series'].apply(lambda x: '<a href="{}-summary.ipynb">{}</a>'.format(x.replace('/', '-'), x))
# Style the output
(df.style
.set_properties(**{'font-size': '120%'})
.set_properties(subset=['series'], **{'text-align': 'left', 'font-weight': 'bold'})
.format('{:,}', ['total_items', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages'])
.format('{:.2%}', ['% open', '% digitised'])
# Hide the index
.set_table_styles([dict(selector="th", props=[("font-size", "120%"), ("text-align", "center")]),
dict(selector='.row_heading, .blank', props=[('display', 'none')])])
.background_gradient(cmap='Greens', subset=['% open', '% digitised'], high=0.5)
)
series | total_items | date_from | date_to | Open | OWE | NYE | Closed | digitised_files | digitised_pages | % open | % digitised | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | B13 | 20,194 | 1800 | 2005 | 19,786 | 8 | 400 | 0 | 354 | 5,043 | 97.98% | 1.75% |
1 | B6003 | 3 | 1904 | 1959 | 3 | 0 | 0 | 0 | 0 | 0 | 100.00% | 0.00% |
2 | BP343/15 | 2,571 | 1916 | 1955 | 2,566 | 0 | 5 | 0 | 85 | 176 | 99.81% | 3.31% |
3 | D2860 | 1 | 1902 | 1957 | 0 | 1 | 0 | 0 | 0 | 0 | 0.00% | 0.00% |
4 | D5036 | 1 | 1906 | 1935 | 1 | 0 | 0 | 0 | 0 | 0 | 100.00% | 0.00% |
5 | D596 | 11,395 | 1871 | 1971 | 2,983 | 31 | 8,381 | 0 | 185 | 3,031 | 26.18% | 1.62% |
6 | E752 | 722 | 1905 | 1941 | 719 | 0 | 3 | 0 | 717 | 9,310 | 99.58% | 99.31% |
7 | J2481 | 858 | 1897 | 1903 | 858 | 0 | 0 | 0 | 858 | 2,031 | 100.00% | 100.00% |
8 | J2482 | 799 | 1902 | 1912 | 799 | 0 | 0 | 0 | 798 | 3,153 | 100.00% | 99.87% |
9 | J2483 | 14,438 | 1903 | 1956 | 14,436 | 0 | 2 | 0 | 14,436 | 79,210 | 99.99% | 99.99% |
10 | J3115 | 161 | 1899 | 1928 | 161 | 0 | 0 | 0 | 161 | 1,344 | 100.00% | 100.00% |
11 | K1145 | 4,816 | 1900 | 1955 | 4,791 | 0 | 25 | 0 | 175 | 874 | 99.48% | 3.63% |
12 | P437 | 4,958 | 1901 | 1940 | 4,945 | 10 | 2 | 1 | 18 | 442 | 99.74% | 0.36% |
13 | P526 | 2 | 1909 | 1918 | 1 | 0 | 1 | 0 | 0 | 0 | 50.00% | 0.00% |
14 | PP4/2 | 613 | 1903 | 1947 | 610 | 0 | 3 | 0 | 28 | 1,512 | 99.51% | 4.57% |
15 | PP6/1 | 6,010 | 1906 | 1978 | 1,863 | 33 | 4,109 | 5 | 245 | 6,461 | 31.00% | 4.08% |
16 | SP11/26 | 27 | 1902 | 1902 | 27 | 0 | 0 | 0 | 5 | 84 | 100.00% | 18.52% |
17 | SP11/6 | 191 | 1902 | 1947 | 101 | 0 | 90 | 0 | 1 | 323 | 52.88% | 0.52% |
18 | SP115/1 | 1,787 | 1884 | 1943 | 1,787 | 0 | 0 | 0 | 9 | 285 | 100.00% | 0.50% |
19 | SP115/10 | 6 | 1884 | 1888 | 6 | 0 | 0 | 0 | 0 | 0 | 100.00% | 0.00% |
20 | SP42/1 | 16,256 | 1881 | 1960 | 15,525 | 0 | 731 | 0 | 3,253 | 45,862 | 95.50% | 20.01% |
21 | SP726/1 | 6 | 1902 | 1959 | 6 | 0 | 0 | 0 | 0 | 0 | 100.00% | 0.00% |
22 | ST84/1 | 2,765 | 1855 | 1975 | 2,758 | 0 | 7 | 0 | 434 | 13,979 | 99.75% | 15.70% |
Now let's calculate some aggregate data across the complete set of series harvests.
total_items = df['total_items'].sum()
print('Total items: {:,}'.format(total_items))
Total items: 88,580
date_from = df['date_from'].min()
print('Earliest date: {}'.format(date_from))
Earliest date: 1800
date_to = df['date_to'].max()
print('Latest date: {}'.format(date_to))
Latest date: 2005
access_status = {}
total = df['total_items'].sum()
print('Access status:\n')
for status in ['Open', 'OWE', 'NYE', 'Closed']:
status_total = df[status].sum()
access_status[status] = status_total
print(' {: <10} {: <10,} ({:.2%})'.format(status, status_total, status_total/total))
Access status: Open 74,732 (84.37%) OWE 83 (0.09%) NYE 13,759 (15.53%) Closed 6 (0.01%)
digitised_files = df['digitised_files'].sum()
print('Total files digitised: {:,}'.format(digitised_files))
Total files digitised: 21,762
digitised_pages = df['digitised_pages'].sum()
print('Total pages digitised: {:,}'.format(digitised_pages))
Total pages digitised: 173,120