In [1]:

# Identifier of the series to examine; later cells use it to locate the
# matching CSV under the data directory and to produce the summary.
series = "A12694"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode before any figures are drawn
# (presumably required for py.iplot to render inline — confirm against plotly docs)
py.init_notebook_mode()

In [3]:

# Load the item list for this series from data/<series>.csv, where any '/'
# in the series identifier is replaced by '-' in the file name.
# The start and end date columns are parsed into datetimes on load.
df = pd.read_csv(
    os.path.join('data', series.replace('/', '-') + '.csv'),
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Show the summary for this series using the local series_details helper;
# the rendered output (item totals, access status, digitisation counts,
# content date range) follows this cell.
series_details.display_summary(series, df)

National Archives of Australia: Series A12694

Source material collected for a research project on aspects of the cold war in Australia, single number series

Total items	25
Access status
Open with exception	20 (80.00%)
Open	5 (20.00%)
Number of items digitised	8 (32.00%)
Number of pages digitised	669
Date of earliest content	1965
Date of latest content	1986

Content preview

In [5]:

# How many rows of the dataframe to preview; raise this value to see more
number_of_rows = 5

# Render the first rows as a styled table: left-align the 'title' column,
# centre the header cells, and hide the row-index cells via CSS.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': "th", 'props': [("text-align", "center")]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	7949343	A12694	8	Directors and Regional Directors Conferences from 1970	1970 - 1973	1970-01-01 00:00:00	1973-01-01 00:00:00	Open with exception	Canberra	False	0
1	7949625	A12694	9	Throssell, Richard Prichard Volume 4	1965 - 1969	1965-01-01 00:00:00	1969-01-01 00:00:00	Open with exception	Canberra	False	0
2	7949626	A12694	10	Throssell, Richard Prichard Volume 5	1971 - 1974	1971-01-01 00:00:00	1974-01-01 00:00:00	Open with exception	Canberra	True	70
3	7949627	A12694	11	Policy or directives about the employment of homosexuals Volume 2	1969 - 1969	1969-01-01 00:00:00	1969-01-01 00:00:00	Open with exception	Canberra	False	0
4	7949628	A12694	12	Policy or directives about the employment of homosexuals Volume 3	1970 - 1971	1970-01-01 00:00:00	1971-01-01 00:00:00	Open with exception	Canberra	False	0

Plot content dates

In [6]:

# Build a figure from the dataframe with the local series_details helper,
# then render it in the notebook with plotly's offline iplot.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [7]:

# Combine all of the file titles into a single string
# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by spaces, ready for the word-frequency cells.
# (The stray `a` alias from the original chained assignment was unused and
# has been dropped so it no longer leaks into the notebook namespace.)
title_text = df['title'].str.lower().str.cat(sep=' ')

In [8]:

# Tabulate how often each word occurs in the combined titles
# (output is a table of word / count).
series_details.display_word_counts(title_text)

Out[8]:

	word	count
7	volume	16
40	papers	6
39	miscellaneous	6
14	2	5
26	1	4
30	soviet	4
31	embassy	4
48	australia	4
32	contact	4
33	members	4
34	parliament	4
28	projects	3
29	branch	3
15	3	3
13	homosexuals	3
27	special	3
12	employment	3
11	directives	3
10	policy	3
8	4	3
61	communist	3
62	party	3
38	intelligence	2
36	briefings	2
67	organisation	2

In [9]:

# Number of words per n-gram: 2 gives bigrams; increase for larger n-grams
# (3 for trigrams, etc.)
ngram_count = 2
# Tabulate the most frequent n-grams of that size in the combined titles
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	miscellaneous papers	6
1	volume 2	5
2	contact with	4
3	volume 1	4
4	of australia	4
5	embassy contact	4
6	parliament volume	4
7	members of	4
8	of parliament	4
9	soviet embassy	4
10	with members	4
11	branch volume	3
12	volume 4	3
13	policy or	3
14	the employment	3
15	special projects	3
16	communist party	3
17	projects branch	3
18	or directives	3
19	of homosexuals	3
20	directives about	3
21	homosexuals volume	3
22	employment of	3
23	party of	3
24	volume 3	3

In [ ]: