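This notebook brings together annual harvests of files with an access status of 'closed', scraped from RecordSearch, the online database of the National Archives of Australia (NAA). The data files are CSV files, one per harvest, loaded below from the `data` directory.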
The current code used to harvest 'closed' files is in this notebook. Previous versions can be found in this repository.
import pandas as pd
from pathlib import Path
import altair as alt
# Harvest year -> CSV file holding that year's harvest of closed files.
# Each file name carries a date in the January following the year it is keyed by.
# Order matters: the files are loaded in this sequence.
harvests = dict(
    [
        ('2015', 'closed-20160101.csv'),
        ('2016', 'closed-20170109.csv'),
        ('2017', 'closed-20180101.csv'),
        ('2018', 'closed-20190101.csv'),
        ('2019', 'closed-20200101.csv'),
        ('2020', 'closed-20210101.csv'),
    ]
)
# Load all the data into a single dataframe, tagging every row with the
# year its harvest is keyed by in `harvests`.
date_columns = ['contents_start_date', 'contents_end_date', 'access_decision_date']
dfs = []
for year, data_file in harvests.items():
    df_year = pd.read_csv(
        Path('data', data_file),
        parse_dates=date_columns,
        # Keep blank cells as empty strings instead of NaN, so a file with
        # no recorded reasons has reasons == '' rather than a float NaN.
        keep_default_na=False,
    )
    df_year['harvested_year'] = year
    dfs.append(df_year)

# Every CSV is read with its own 0..n-1 index. ignore_index=True renumbers
# the combined rows so that row labels are unique across harvests.
df = pd.concat(dfs, ignore_index=True)
df.head()
# Number of closed files in each harvest, as a two-column frame
# ('year', 'count') ready for charting.
year_counts = (
    df['harvested_year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)
# Notebook display only: the sorted copy is shown, not stored back
# into year_counts.
year_counts.sort_values(by='year')
# Bar chart of the number of closed files in each yearly harvest.
# `point=True` was dropped from mark_bar(): that option overlays point
# marks on line/area marks and has no effect on bars.
alt.Chart(year_counts).mark_bar().encode(
    x=alt.X('year:O', title='Year end'),
    y=alt.Y('count:Q', title='Number of closed files'),
    # Explicit nominal type, matching the colour encoding of the other charts.
    color=alt.Color('year:N', legend=None),
    tooltip=['year:O', 'count:Q'],
).properties(width=300)
# Build a frame with one row per (file, reason): the pipe-separated
# 'reasons' string is split into a list and exploded into a new
# 'reason' column, leaving the original 'reasons' column untouched.
df_reasons = df.copy()
df_reasons['reason'] = df_reasons['reasons'].str.split('|')
df_reasons = df_reasons.explode('reason')

# Label files that have no recorded reason. The result is assigned back
# rather than calling replace(..., inplace=True) on the column selection:
# that chained in-place form acts on a temporary object and leaves the
# frame unchanged under pandas Copy-on-Write.
df_reasons['reason'] = df_reasons['reason'].replace('', 'No reason')

# Alphabetical list of every distinct reason (also used by the dropdown).
unique_reasons = sorted(df_reasons['reason'].unique())
unique_reasons
# Count files per (harvest year, reason) pair; the result has the
# columns 'year', 'reason' and 'count'.
harvest_reasons_counts = (
    df_reasons
    .groupby(['harvested_year', 'reason'])
    .size()
    .reset_index(name='count')
    .rename(columns={'harvested_year': 'year'})
)
# Small multiples: one bar chart per reason, five to a row, each showing
# the number of files per harvest year.
facet_encodings = dict(
    x=alt.X('year:O', title=None),
    y=alt.Y('count:Q', title='Number of files'),
    color=alt.Color('year:N', legend=None),
    facet=alt.Facet('reason:O', align='each', columns=5, title='Reason for being closed'),
    tooltip=['year:O', 'reason:N', 'count:Q'],
)
reasons_facet_chart = (
    alt.Chart(harvest_reasons_counts)
    .mark_bar()
    .encode(**facet_encodings)
    .properties(height=200)
    # Each facet gets its own x scale instead of one shared across facets.
    .resolve_scale(x='independent')
)
reasons_facet_chart
Select a reason from the dropdown list to examine change over time.
# Dropdown listing every reason, preceded by a None option labelled 'All'
# (presumably choosing it clears the filter - confirm in the rendered chart).
dropdown_options = [None, *unique_reasons]
dropdown_labels = ['All', *unique_reasons]
input_dropdown = alt.binding_select(options=dropdown_options, labels=dropdown_labels)

# Single-value selection on the 'reason' field, driven by the dropdown.
selection = alt.selection_single(fields=['reason'], bind=input_dropdown, name='Select')

filtered_encodings = dict(
    x=alt.X('year:O', title=None),
    y=alt.Y('count:Q', title='Number of files'),
    color=alt.Color('year:N', legend=None),
    column=alt.Column('reason:N', title='Reason for being closed'),
    tooltip=['year:O', 'reason:N', 'count:Q'],
)
filtered_reasons_chart = (
    alt.Chart(harvest_reasons_counts)
    .mark_bar()
    .encode(**filtered_encodings)
    # The selection is both attached to the chart (to render the dropdown)
    # and used as the filter predicate.
    .add_selection(selection)
    .transform_filter(selection)
    .properties(height=200)
    .resolve_scale(x='independent')
)
filtered_reasons_chart