In [2]:

# Identifier of the series to analyse; later cells use it to build the CSV file name
# and pass it to series_details.display_summary.
series = 'B13'

In [3]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode; presumably required so the py.iplot call
# further down can render its figure inline — confirm against the plotly version in use.
py.init_notebook_mode()

In [4]:

# Build the path to this series' CSV inside the local 'data' directory.
# Any '/' in the series identifier is replaced with '-' to form the file name.
csv_path = os.path.join('data', '{}.csv'.format(series.replace('/', '-')))

# Read the file, asking pandas to parse the two date columns as datetimes.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [5]:

# Render the summary for this series via the project-local series_details helper,
# passing both the series identifier and the loaded dataframe.
series_details.display_summary(series, df)

National Archives of Australia: Series B13

General and classified correspondence, annual single number series

Total items	20,194
Access status
Open	19,786 (97.98%)
Not yet examined	400 (1.98%)
Open with exception	8 (0.04%)
Number of items digitised	354 (1.75%)
Number of pages digitised	5,043
Date of earliest content	1800
Date of latest content	2005

Download the complete CSV file

Content preview¶

In [6]:

# Change the number_of_rows value to see more
number_of_rows = 5

# Table-level CSS: centre the header cells, and hide the index column
# together with the blank corner cell above it.
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe with left-aligned titles.
# The styled frame is the cell's last expression, so the notebook renders it.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles(table_styles)
)

Out[6]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	787258	B13	1924/7516	Charlie Lam Sun (Charlie Shack Mayberry) - Arrived Sydney per "Taiyuan" 15.3.1924	1924 - circa1924	1924-01-01 00:00:00	NaT	Open	Melbourne	False
1	790335	B13	1926/6755	Edward Traynor - permission to enter Australia - arrived per "Beltana" 13.5.1926	1926 - 1926	1926-01-01 00:00:00	1926-01-01 00:00:00	Open	Melbourne	False
2	3280504	B13	V1960/14261	Tabacco sales in Victoria [1.00 cms]	1960 - 1962	1960-01-01 00:00:00	1962-01-01 00:00:00	Open	Melbourne	False
3	3280538	B13	V1979/4475	James Richardson Co Pty Ltd, Licensed Warehouse, Richardsons Bond [Contains plans of Richardssons Bons] [4.00 cms]	1963 - 1984	1963-01-01 00:00:00	1984-01-01 00:00:00	Open with exception	Melbourne	False
4	3283801	B13	V1953/12491	Tobacco & Cigarettes - Duty Free issues to Ships Crews [2cm]	1945 - 1957	1945-01-01 00:00:00	1957-01-01 00:00:00	Open	Melbourne	False

Plot content dates¶

In [12]:

# Build a figure from the dataframe with the series_details helper, then render it
# inline through plotly's offline iplot.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')

View word frequencies¶

In [13]:

# Combine all of the file titles into a single lowercase string, with the
# individual titles separated by a space. This string is the input for the
# word-frequency and ngram cells that follow.
title_text = df['title'].str.lower().str.cat(sep=' ')

In [14]:

# Show a table of words and their counts for the combined title text,
# using the project-local series_details helper.
series_details.display_word_counts(title_text)

Out[14]:

	word	count
7	per	5,172
314	ex	4,363
902	exemption	3,688
618	certificate	3,577
1655	dictation	3,577
1581	test	3,553
71	melbourne	3,168
538	application	2,442
174	departure	2,006
14	australia	1,977
1543	ah	1,796
949	passengers	1,620
173	arrival	1,560
26	ltd	1,446
104	act	1,180
1482	mrs	1,175
6	sydney	1,075
1861	s.s	1,074
12	permission	1,050
830	crew	1,015
25	pty	950
1621	applied	927
1583	chinese	862
2049	enemy	858
24	co	835

In [15]:

# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Show a table of ngrams of that size and their counts for the combined title text.
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	exemption from	3,550
1	from dictation	3,540
2	dictation test	3,533
3	for exemption	3,004
4	certificate for	2,872
5	for certificate	2,660
6	application for	2,236
7	melbourne per	1,054
8	departure per	1,009
9	pty ltd	927
10	applied for	923
11	to australia	859
12	trading with	782
13	enemy act	765
14	with enemy	764
15	test ah	735
16	permission to	698
17	act 1939	691
18	of exemption	636
19	certificate of	625
20	crew member	623
21	arrival per	458
22	to enter	437
23	of certificate	419
24	passengers melbourne	390

In [ ]: