In [1]:

# Identifier of the National Archives of Australia series examined in this notebook.
series = "A9626"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode up front; figures are drawn later with py.iplot.
py.init_notebook_mode()

In [3]:

# Any '/' in the series identifier is replaced by '-' to build the CSV file name,
# which is looked up inside the local 'data' directory.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the item records for the series, parsing both date columns as datetimes.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [4]:

# Show the summary for this series; the rendered output below lists item totals,
# access status, digitisation counts and the content date range.
series_details.display_summary(series, df)

National Archives of Australia: Series A9626

Photographic material (including photocopies of photographs) created by ASIO

Total items	1,075
Access status
Open	792 (73.67%)
Open with exception	277 (25.77%)
Not yet examined	6 (0.56%)
Number of items digitised	570 (53.02%)
Number of pages digitised	9,370
Date of earliest content	1919
Date of latest content	1998

Content preview

In [5]:

# Change the number_of_rows value to see more
number_of_rows = 5

# Display the first rows of the dataframe as a styled table:
#   - cells in the 'title' column are left-aligned,
#   - header cells are centred,
#   - the index column (row headings and the blank corner cell) is hidden via CSS.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	1188387	A9626	1	ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat]	circa1950 - circa1950	NaT	NaT	Open	Canberra	False	0
1	1188390	A9626	2	ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat]	circa1950 - circa1950	NaT	NaT	Open	Canberra	False	0
2	1188393	A9626	3	ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat]	circa1950 - circa1950	NaT	NaT	Open	Canberra	True	1
3	1188395	A9626	4	ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat]	circa1950 - circa1950	NaT	NaT	Open	Canberra	False	0
4	1188399	A9626	5	ASIO photograph of John Wear Burton [former Secretary of Department of External Affairs]	circa1950 - circa1950	NaT	NaT	Open	Canberra	False	0

Plot content dates

In [6]:

# Build the content-dates figure from the item records, then render it inline with plotly.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [7]:

# Combine all of the file titles into a single lower-cased string, with
# consecutive titles separated by one space.
# NOTE(review): because the titles are simply joined, any n-grams computed from
# this string can straddle the boundary between two titles (the 'robinson asio'
# bigram in the table further down presumably arises this way) -- confirm
# whether that is acceptable for the analysis.
title_text = df['title'].str.lower().str.cat(sep=' ')

In [8]:

# Show the most frequent words found in the combined titles (word / count table below).
series_details.display_word_counts(title_text)

Out[8]:

	word	count
0	asio	246
1	surveillance	243
2	photograph	237
174	robinson	161
198	aka	100
11	john	91
3	james	70
30	william	67
173	albert	56
175	eva	51
101	march	43
440	number	41
187	george	39
80	may	33
178	max	31
82	sydney	30
146	demonstration	29
208	robert	28
121	david	28
179	ernest	28
162	photographs	28
131	doreen	28
81	day	26
195	nee	25
153	canberra	23

In [9]:

# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Show the most frequent n-grams of that size in the combined titles (ngram / count table below).
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	asio surveillance	237
1	surveillance photograph	228
2	photograph of	208
3	robinson asio	119
4	albert robinson	51
5	eva robinson	50
6	of eva	48
7	of albert	48
8	william robinson	29
9	photograph james	28
10	james william	28
11	ernest robinson	25
12	max ernest	25
13	of max	25
14	may day	24
15	day march	23
16	at the	15
17	scan of	13
18	resolution scan	13
19	high resolution	13
20	of page	13
21	doreen burrow	12
22	of doreen	12
23	burrow at	12
24	zangalis george	11

In [ ]: