In [1]:

# Series identifier for this notebook; it selects the CSV under data/ and is
# passed to the summary display.
series = "P437"

In [2]:

import os
import pandas as pd
# Helper module providing the summary, date-plot, word-count and ngram displays called in this notebook
import series_details
import plotly.offline as py
# Initialise plotly's notebook mode up front, before py.iplot is called further down
py.init_notebook_mode()

In [3]:

# The CSV is named after the series, with any '/' in the identifier replaced
# by '-' in the file name.
csv_name = '{}.csv'.format(series.replace('/', '-'))

# Load the item list, parsing both date columns as datetimes.
df = pd.read_csv(
    os.path.join('data', csv_name),
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Show the series summary (item totals, access status, digitisation counts and content date range)
series_details.display_summary(series, df)

National Archives of Australia: Series P437

Correspondence Files, Annual Single Number Series

Total items	4,958
Access status
Open	4,945 (99.74%)
Open with exception	10 (0.20%)
Not yet examined	2 (0.04%)
Closed	1 (0.02%)
Number of items digitised	18 (0.36%)
Number of pages digitised	442
Date of earliest content	1901
Date of latest content	1940

Download the complete CSV file

Content preview

In [4]:

# Number of rows shown in the preview -- increase it to see more
number_of_rows = 5

# Styled preview: titles left-aligned, header cells centred, and the
# row-index cells hidden through CSS.
(
    df.head(number_of_rows)
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': "th", 'props': [("text-align", "center")]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[4]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	538211	P437	1940/279	Restricted drugs	1939 - 1940	1939-01-01 00:00:00	1940-01-01 00:00:00	Closed	Hobart	False
1	542152	P437	WHOLE SERIES	Correspondence files of the Collector of Customs, Hobart for the period 1908 to 1940; covers such topics as tariffs, trade, duty, immigration, export permits, patents lighthouse service, ship wrecks, passports,various grant and Bounty schemes	1908 - 1940	1908-01-01 00:00:00	1940-01-01 00:00:00	Open with exception	Hobart	False
2	635923	P437	1910/12	Accounts - postage paid - Board of Trade Journals	1909 - 1910	1909-01-01 00:00:00	1910-01-01 00:00:00	Open	Hobart	False
3	642439	P437	1910/14	Bank guarantees - cancellation of - The Commercial Bank of Tasmania Ltd.	1910 - 1910	1910-01-01 00:00:00	1910-01-01 00:00:00	Open	Hobart	False
4	642442	P437	1910/15	Imports of Fire Arms to Tasmania - January to June 1909. Returns.	1910 - 1910	1910-01-01 00:00:00	1910-01-01 00:00:00	Open	Hobart	False

Plot content dates

In [5]:

# Build the content-dates figure from the dataframe
fig = series_details.plot_dates(df)
# Render the figure inline with plotly's offline iplot
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [6]:

# Combine all of the file titles into a single lowercase, space-separated string.
# (Missing titles are skipped by str.cat when no na_rep is given.)
title_text = df['title'].str.lower().str.cat(sep=' ')

In [7]:

# Show word counts computed from the combined title text
series_details.display_word_counts(title_text)

Out[7]:

	word	count
85	request	508
14	duty	382
5	customs	345
252	act	238
107	return	213
37	tasmania	200
127	goods	196
39	imports	186
219	mr	177
16	export	161
61	application	155
200	company	149
332	ss	135
6	hobart	134
146	import	128
89	forms	127
125	list	121
195	forwarded	118
100	certificate	117
292	launceston	114
141	imported	102
302	beer	102
189	officers	97
212	invoice	95
108	commonwealth	94

In [8]:

# Words per ngram: 2 gives the two-word phrases shown below; raise it for larger ngrams (3 for trigrams, etc.)
ngram_count = 2
# Show the most frequent ngrams of that size found in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	request for	354
1	to be	121
2	application for	111
3	of customs	88
4	duty on	83
5	of duty	82
6	certificate of	82
7	of the	76
8	for the	72
9	return of	68
10	and company	66
11	export of	64
12	with the	62
13	commerce act	61
14	import of	61
15	collector of	59
16	list of	59
17	return showing	57
18	imports of	56
19	being forwarded	55
20	trading with	55
21	of exemption	54
22	the enemy	53
23	importation of	51
24	of goods	49

In [ ]: