In [1]:

# Identifier of the series examined by this notebook; every later cell reads it.
series = "B2836"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode so figures passed to py.iplot
# can be rendered inline in this notebook.
py.init_notebook_mode()

In [3]:

# Build the CSV file name from the series identifier, swapping any '/' for '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the item records, parsing the two date columns as datetimes.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [4]:

# Show the summary for this series, built from the loaded item records.
series_details.display_summary(series, df)

National Archives of Australia: Series B2836

Reference material accumulated by agents

Total items	14
Access status
Open	14 (100.00%)
Number of items digitised	3 (21.43%)
Number of pages digitised	375
Date of earliest content	1926
Date of latest content	1972

Content preview

In [5]:

# Number of rows shown in the preview table; raise it to see more.
number_of_rows = 5

# Table-level CSS: centre the header cells, and hide the index column
# (the row headings and the blank header cell above them).
preview_table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Left-align the title column, then apply the table-level styles.
# The Styler is the cell's final expression, so the notebook renders it.
(
    df[:number_of_rows]
    .style
    .set_properties(['title'], **{'text-align': 'left'})
    .set_table_styles(preview_table_styles)
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	412483	B2836	GROUP 62	Peace Publications (Three)	1950 - 1951	1950-01-01 00:00:00	1951-01-01 00:00:00	Open	Melbourne	False	0
1	412493	B2836	GROUP 64	Publications of the Australia - Soviet Friendship League (35 Leaflets etc)	1940 - 1944	1940-01-01 00:00:00	1944-01-01 00:00:00	Open	Melbourne	False	0
2	412501	B2836	GROUP 59/4	A.C.P. Publications	1932 - 1952	1932-01-01 00:00:00	1952-01-01 00:00:00	Open	Melbourne	False	0
3	412505	B2836	GROUP 60	CPA Publications and Others (110 Leaflets etc)	circa1952 - 1953	NaT	1953-01-01 00:00:00	Open	Melbourne	True	264
4	412512	B2836	GROUP 53 PART 1	"Workers Star" (Nos 129-163, with Gaps)	1939 - 1939	1939-01-01 00:00:00	1939-01-01 00:00:00	Open	Melbourne	False	0

Plot content dates

In [6]:

# Build the content-dates figure from the item records...
fig = series_details.plot_dates(df)

# ...and render it inline with plotly's offline mode.
py.iplot(
    fig,
    filename='series-dates-bar',
)

View word frequencies

In [7]:

# Combine all of the file titles into a single lower-cased string,
# with individual titles separated by a space.
title_text = df['title'].str.lower().str.cat(sep=' ')

In [8]:

# Show how often each word occurs across the combined titles.
series_details.display_word_counts(title_text)

Out[8]:

	word	count
1	publications	6
8	leaflets	6
9	etc	6
11	cpa	4
15	star	4
3	australia	3
22	issues	3
14	workers	3
27	communist	2
16	nos	2
28	party	2
21	10	2
7	35	2
29	56	1
30	48	1
31	campaign	1
0	peace	1
32	1951	1
26	51	1
34	publictions	1
35	83	1
36	reference	1
37	material	1
38	accumulated	1
33	referendum	1

In [9]:

# Number of words per ngram: 2 gives two-word phrases (bigrams).
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Show the most frequent ngrams of that size found in the combined titles.
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	leaflets etc	6
1	workers star	3
2	35 leaflets	2
3	other publications	2
4	australia and	2
5	etc cpa	2
6	and other	2
7	party of	2
8	star nos	2
9	cpa communist	2
10	communist party	2
11	star 10	2
12	10 issues	2
13	of australia	2
14	48 leaflets	1
15	publictions 83	1
16	of the	1
17	129-163 with	1
18	others 110	1
19	by agents	1
20	material accumulated	1
21	the workers	1
22	friendship league	1
23	campaign 1951	1
24	cpa publictions	1

In [ ]: