In [1]:

# Identifier of the National Archives of Australia series examined in this notebook.
# It selects the CSV file that is loaded and is passed to the summary display.
series = "A6283"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Put plotly's offline module into notebook mode before any py.iplot() call,
# so that figures can be rendered inline (presumably by loading plotly.js into
# the page -- confirm against the plotly offline documentation).
py.init_notebook_mode()

In [3]:

# Build the CSV file name from the series identifier; any '/' in the identifier
# is swapped for '-' so that it cannot be read as a path separator.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the item list, parsing the start and end date columns as datetimes.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [4]:

# Render the overview for this series (title, item totals, access status
# breakdown, digitisation counts and content date range) from the series
# identifier and the loaded item dataframe.
series_details.display_summary(series, df)

National Archives of Australia: Series A6283

Correspondence files, multiple number series (Royal Commission Section)

Total items	256
Access status
Open with exception	208 (81.25%)
Not yet examined	24 (9.38%)
Open	21 (8.20%)
Closed	3 (1.17%)
Number of items digitised	23 (8.98%)
Number of pages digitised	3,352
Date of earliest content	1800
Date of latest content	1959

## Content preview

In [5]:

# Number of rows shown in the preview -- increase it to see more
number_of_rows = 5

# Style the first rows for display: left-align the 'title' column, centre the
# header cells, and hide the cells matching '.row_heading, .blank'
# (presumably the index column and its blank corner header).
(
    df[:number_of_rows]
    .style
    .set_properties(['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	241384	A6283	18	Communist Party of Australia	1954 - 1955	1954-01-01 00:00:00	1955-01-01 00:00:00	Open with exception	Canberra	False
1	241385	A6283	19	Voks	1954 - 1955	1954-01-01 00:00:00	1955-01-01 00:00:00	Open with exception	Canberra	False
2	241388	A6283	24	Australian Government Policies	1954 - 1954	1954-01-01 00:00:00	1954-01-01 00:00:00	Open with exception	Canberra	False
3	241389	A6283	25	Australian Government Departments and Instrumentalities	1954 - 1955	1954-01-01 00:00:00	1955-01-01 00:00:00	Open with exception	Canberra	False
4	241390	A6283	27	Corrective Labour Camps	1954 - 1954	1954-01-01 00:00:00	1954-01-01 00:00:00	Open with exception	Canberra	False

## Plot content dates

In [6]:

# Build a figure of the items' content dates from the dataframe
# (the figure type is decided inside series_details.plot_dates).
fig = series_details.plot_dates(df)
# Display the figure in the notebook output using plotly's offline mode.
py.iplot(fig, filename='series-dates-bar')

## View word frequencies

In [7]:

# Combine all of the file titles into a single lowercase string,
# with the individual titles separated by spaces
title_text = df['title'].str.lower().str.cat(sep=' ')

In [8]:

# Tabulate how often each word occurs in the combined title text
series_details.display_word_counts(title_text)

Out[8]:

	word	count
183	copy	96
182	reference	96
33	volume	55
40	petrov	50
35	vol	41
49	vladimir	37
78	royal	29
47	press	28
38	5	27
79	commission	27
29	interrogations	27
48	cuttings	26
30	petrovs	25
41	1	23
46	2	22
50	mihailovich	21
82	mvd	21
56	evdokia	20
42	overseas	19
28	reports	19
67	intelligence	17
68	services	17
52	story	16
226	mail	16
228	september	16

In [9]:

# Number of words per ngram: 2 gives bigrams; change it for larger ngrams (trigrams etc)
ngram_count = 2
# Tabulate the most frequent ngrams of that size in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	reference copy	96
1	of interrogations	27
2	royal commission	27
3	press cuttings	26
4	of petrovs	24
5	cuttings volume	24
6	petrov vladimir	22
7	volume 5	22
8	vladimir mihailovich	21
9	interrogations of	20
10	overseas intelligence	17
11	intelligence services	17
12	5 'the	16
13	reports of	16
14	'the courier	14
15	by vladimir	14
16	evdokia petrov	14
17	courier mail	14
18	empire of	13
19	the empire	13
20	vladimir and	13
21	and evdokia	13
22	of fear	13
23	1955 p2	12
24	vol 1	11

In [ ]: