In [1]:

# Identifier of the archival series this notebook summarises; it is used to
# build the CSV file name and is passed to the summary display further down.
series = "J3115"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode so that py.iplot() can render figures inline
py.init_notebook_mode()

In [3]:

# Any '/' in the series identifier is replaced with '-' to form the CSV file name.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the item records for the series, parsing the two date columns as dates.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [4]:

# Display an overview of the series, built from its identifier and the loaded item records
# (rendering is done by the local series_details module).
series_details.display_summary(series, df)

National Archives of Australia: Series J3115

Alien Immigration files relating to applications for Certificate of Domicile, Certificates of Exemption from the Chinese Immigration Restriction Act 1888 and Certificates of Exemption from the Dictation Test that includes photographs, birth certificates and other historical documents, imposed single number series

Total items	161
Access status
Open	161 (100.00%)
Number of items digitised	161 (100.00%)
Number of pages digitised	1,344
Date of earliest content	1899
Date of latest content	1928

Download the complete CSV file

Content preview

In [4]:

# Change the number_of_rows value to see more
number_of_rows = 5

# Table-level CSS: centre the header cells and hide the row-index column
# together with the blank corner cell.
table_styles = [
    {'selector': "th", 'props': [("text-align", "center")]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe as a styled table with left-aligned titles
preview = df[:number_of_rows]
preview.style.set_properties(subset=['title'], **{'text-align': 'left'}).set_table_styles(table_styles)

Out[4]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	5058001	J3115	1	Certificate of Domicile for Mah Wah, a market gardener from Bundaberg - includes photographs	1902 - 1902	1902-01-01 00:00:00	1902-01-01 00:00:00	Open	Brisbane	True	1
1	5058002	J3115	2	Certificate of Domicile for Sui Tim, a fruiterer and general merchant from Brisbane - includes photographs	1902 - 1903	1902-01-01 00:00:00	1903-01-01 00:00:00	Open	Brisbane	True	1
2	5058003	J3115	50	Certificate of Domicile for Charlie Jock, a storekeeper from Clermont - includes photographs	1903 - 1905	1903-01-01 00:00:00	1905-01-01 00:00:00	Open	Brisbane	True	2
3	5058004	J3115	3	Certificate of Domicile for Tommy Young Hopp, a cook from Brisbane - includes photographs	1902 - 1903	1902-01-01 00:00:00	1903-01-01 00:00:00	Open	Brisbane	True	1
4	5058005	J3115	4	Certificate of Domicile for Jong Hee, the owner of a paper bag factory from Brisbane - includes photographs	1902 - 1904	1902-01-01 00:00:00	1904-01-01 00:00:00	Open	Brisbane	True	1

Plot content dates

In [5]:

# Build a figure from the item records using the local series_details helper
# (presumably a bar chart of content dates, going by the filename below — confirm in series_details).
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook through plotly's offline mode
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [6]:

# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by a space, ready for the word and ngram counts.
# (The stray extra assignment target `a` was dropped: nothing uses it and it
# only left a meaningless global behind in the kernel.)
title_text = df['title'].str.lower().str.cat(sep=' ')

In [7]:

# Show a word/count table for the combined titles (rendered by the local series_details module)
series_details.display_word_counts(title_text)

Out[7]:

	word	count
0	certificate	168
7	includes	139
1	domicile	109
8	photographs	100
45	ah	74
106	application	55
181	photograph	46
104	correspondence	43
17	storekeeper	41
178	exemption	40
5	gardener	36
105	relating	35
14	brisbane	17
158	birth	15
391	pages	14
174	born	13
194	queensland	13
39	island	13
44	townsville	13
115	also	13
13	merchant	12
38	thursday	12
154	father	12
48	lee	11
65	hing	11

In [8]:

# Number of consecutive words per ngram: 2 gives bigrams; change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Show an ngram/count table for the combined titles using ngrams of that size
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	certificate of	150
1	of domicile	109
2	includes photographs	94
3	domicile for	93
4	photographs certificate	51
5	application for	48
6	of exemption	40
7	for certificate	38
8	includes photograph	36
9	relating to	35
10	for ah	35
11	a storekeeper	33
12	correspondence relating	33
13	storekeeper from	29
14	exemption for	23
15	a gardener	23
16	gardener from	22
17	to the	21
18	photographs application	15
19	a certificate	15
20	photograph certificate	15
21	for a	14
22	birth certificate	14
23	from the	13
24	and correspondence	13

In [ ]: