In [2]:

# Series identifier: used to build the data/<series>.csv file name (with any '/' replaced by '-')
# and passed to series_details.display_summary() further down.
series = 'A9108'

In [3]:

# Standard library
import os

# Third-party
import pandas as pd
import plotly.offline as py

# Project helper module (summary, plotting and word-count helpers used below)
import series_details

# Set up plotly's offline mode so figures can be drawn inside the notebook
py.init_notebook_mode()

In [4]:

# Work out where this series' CSV lives; a '/' in the series id becomes '-' in the file name
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the items, asking pandas to parse the two date columns as datetimes
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [5]:

# Show the summary for this series; the helper is given the series id and the items dataframe
series_details.display_summary(series, df)

National Archives of Australia: Series A9108

'HQ Miscellaneous Files' [Headquarters microfilm of Investigation Branch, Commonwealth Investigation Service and ASIO files]

Total items	691
Access status
Open with exception	465 (67.29%)
Open	220 (31.84%)
Closed	4 (0.58%)
Not yet examined	2 (0.29%)
Number of items digitised	107 (15.48%)
Number of pages digitised	9,810
Date of earliest content	1920
Date of latest content	1967

Content preview

In [6]:

# How many rows of the dataframe to preview -- increase to see more
number_of_rows = 5

# Take the first rows, then style them: titles left-aligned, header cells centred,
# and the row-heading / blank corner cells suppressed via CSS
preview = df[:number_of_rows]
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]
preview.style.set_properties(['title'], **{'text-align': 'left'}).set_table_styles(table_styles)

Out[6]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	506074	A9108	ROLL 20/28	Communist meetings in Western Australia	1945 - 1949	1945-01-01 00:00:00	1949-01-01 00:00:00	Open	Canberra	False	0
1	1353596	A9108	ROLL 20/39	Netherlands East Indies - Military detention camp - Casino [176pp]	1945 - 1947	1945-01-01 00:00:00	1947-01-01 00:00:00	Open	Canberra	True	199
2	1353598	A9108	ROLL 20/45	Chinese communists - General activities in Australia	1948 - 1951	1948-01-01 00:00:00	1951-01-01 00:00:00	Open with exception	Canberra	True	145
3	1353599	A9108	ROLL 20/27	Communistic direct action and demonstration in South Australia	1949 - 1949	1949-01-01 00:00:00	1949-01-01 00:00:00	Open with exception	Canberra	False	0
4	1353600	A9108	ROLL 20/46	Chinese Consulate - Activities	1950 - 1951	1950-01-01 00:00:00	1951-01-01 00:00:00	Open with exception	Canberra	True	9

Plot content dates

In [7]:

# Build the content-dates figure from the dataframe, then draw it in the notebook
# using plotly's offline mode (imported above as `py`)
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [8]:

# Combine all of the file titles into a single lower-cased string, separated by spaces.
# (The stray `a` alias from the original chained assignment is dropped -- nothing used it.)
title_text = df['title'].str.lower().str.cat(sep=' ')

In [9]:

# Tabulate word frequencies in the combined title text
series_details.display_word_counts(title_text)

Out[9]:

	word	count
0	communist	147
256	pages	139
51	party	133
101	australian	127
3	australia	83
356	0.5cm	70
352	1cm	60
15	activities	39
22	communism	32
1053	ship	31
20	south	28
743	0.25cm	28
655	2cm	25
64	organisations	23
156	1952	23
78	new	22
23	trade	21
320	committee	20
639	nazi	20
578	movement	18
260	4	18
310	service	18
12	chinese	18
63	association	17
342	league	17

In [10]:

# Number of words per ngram: 2 gives the two-word sequences (bigrams) shown below.
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	communist party	116
1	australian communist	99
2	0.5cm australian	30
3	in australia	26
4	communism in	25
5	south australia	21
6	of australia	21
7	party of	19
8	pages australian	18
9	4 pages	18
10	activities in	17
11	8 pages	15
12	in the	15
13	in south	13
14	2 pages	13
15	for peace	12
16	pty ltd	12
17	trade unions	12
18	1cm australian	12
19	5 pages	11
20	7 pages	11
21	10 pages	11
22	in trade	11
23	in victoria	11
24	3 pages	10

In [ ]: