In [1]:

# National Archives of Australia series identifier; it determines which CSV is loaded below
series = "BP343/15"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Set up plotly's offline mode so figures render inline in the notebook
py.init_notebook_mode()

In [3]:

# Load the series CSV from the data directory. The file is named after the
# series with '/' swapped for '-', and both date columns are parsed as datetimes.
df = pd.read_csv(
    os.path.join('data', series.replace('/', '-') + '.csv'),
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Show the summary for this series: item totals, access status, digitisation counts and date range
series_details.display_summary(series, df)

National Archives of Australia: Series BP343/15

Registers of aliens departing from the Port of Townsville who were granted a certificate exempting from dictation test [CEDT]

Total items	2,571
Access status
Open	2,566 (99.81%)
Not yet examined	5 (0.19%)
Number of items digitised	85 (3.31%)
Number of pages digitised	176
Date of earliest content	1916
Date of latest content	1955

Download the complete CSV file

Content preview

In [5]:

# Number of rows shown in the preview -- increase it to see more
number_of_rows = 5

# Render the first rows as a styled table: titles left-aligned, header cells
# centred, and the index column (row headings and blank corner cell) hidden via CSS
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	9103820	BP343/15	14/1013	Name: Lum Yee - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 466/21	1929 - 1932	1929-01-01 00:00:00	1932-01-01 00:00:00	Open	Brisbane	False
1	9108210	BP343/15	13/824	Name: Hoo Wah (of Townsville) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/23	1928 - 1929	1928-01-01 00:00:00	1929-01-01 00:00:00	Open	Brisbane	False
2	9108211	BP343/15	13/823	Name: Ah Cow (of Charters Towers) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/19	1928 - 1928	1928-01-01 00:00:00	1928-01-01 00:00:00	Open	Brisbane	False
3	9108212	BP343/15	13/822	Name: Bon Kan [Bu Conn] (of Townsville) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/28	1928 - 1928	1928-01-01 00:00:00	1928-01-01 00:00:00	Open	Brisbane	False
4	9108213	BP343/15	13/821	Name: Ah Hat - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/17	1928 - 1928	1928-01-01 00:00:00	1928-01-01 00:00:00	Open	Brisbane	False

Plot content dates

In [6]:

# Build the content-dates figure from the dataframe, then render it inline with plotly
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [7]:

# Lower-case every item title and join them into a single space-separated string
# for the word and ngram counts below (str.cat skips missing titles by default)
title_text = df['title'].str.lower().str.cat(sep=' ')

In [8]:

# Tabulate the most frequent words in the combined titles
series_details.display_word_counts(title_text)

Out[8]:

	word	count
0	name	2,565
3	nationality	2,542
5	birthplace	2,460
12	number	2,323
7	certificate	2,322
11	cedt	2,315
9	dictation	2,313
10	test	2,313
8	exemption	2,312
4	chinese	2,189
6	canton	1,950
16	townsville	852
18	ah	447
73	lee	242
174	japanese	195
175	japan	177
36	chong	129
89	indian	122
93	sing	121
145	wong	112
77	leong	112
2	yee	110
1	lum	109
15	wah	104
171	india	97

In [9]:

# Size of the ngrams to count: 2 = bigrams; raise it for larger ngrams (3 = trigrams, etc.)
ngram_count = 2
# Tabulate the most frequent ngrams of that size in the combined titles
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	cedt number	2,315
1	dictation test	2,313
2	the dictation	2,312
3	certificate of	2,312
4	of exemption	2,312
5	from the	2,312
6	exemption from	2,312
7	test cedt	2,312
8	nationality chinese	2,171
9	chinese birthplace	2,109
10	birthplace canton	1,949
11	canton certificate	1,854
12	of townsville	832
13	townsville nationality	830
14	name ah	322
15	nationality japanese	195
16	japanese birthplace	182
17	japan certificate	166
18	birthplace japan	165
19	name lee	149
20	nationality indian	117
21	indian birthplace	111
22	canton name	93
23	india certificate	91
24	name leong	87

In [ ]: