In [1]:

# Identifier of the series to explore. Later cells use it to build the CSV
# file name and pass it to the summary helper.
series = 'J2482'

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode before any figure is drawn with py.iplot.
py.init_notebook_mode()

In [3]:

# The CSV is named after the series identifier, with any '/' swapped for '-'.
# The two date columns are parsed as datetimes on load.
csv_name = series.replace('/', '-') + '.csv'
df = pd.read_csv(
    os.path.join('data', csv_name),
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Render the summary for this series using the project's series_details helper.
series_details.display_summary(series, df)

National Archives of Australia: Series J2482

Certificates of Domicile issued under The Immigration Restriction Act 1901 and Regulations, annual single number series

Total items	799
Access status
Open	799 (100.00%)
Number of items digitised	798 (99.87%)
Number of pages digitised	3,153
Date of earliest content	1902
Date of latest content	1912

Download the complete CSV file

Content preview

In [4]:

# Change the number_of_rows value to see more
number_of_rows = 5

# Table-level CSS: centre the header cells and hide the index column
# (the row headings and the blank corner cell).
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe with left-aligned titles
(
    df.head(number_of_rows)
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles(table_styles)
)

Out[4]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	5049001	J2482	1904/103	Sheong Fook of Geraldton [Innisfail], Qld - birthplace: Canton, China - departed Geraldton [Innisfail], Queensland on the Empire 7 September 1904	1904 - 1904	1904-01-01 00:00:00	1904-01-01 00:00:00	Open	Brisbane	True	4
1	5049002	J2482	1904/104	Ah Gee of Macnade near Dungeness, Qld - birthplace: Canton, China - departed Dungeness, Queensland on the Tsinan 26 June 1904	1904 - 1904	1904-01-01 00:00:00	1904-01-01 00:00:00	Open	Brisbane	True	3
2	5049003	J2482	1904/105	Ah Yeen of Johnstone near Geraldton [Innisfail] - birthplace: Canton, China - departed Geraldton [Innisfail], Queensland on the Tsinan 25 June 1904	1904 - 1904	1904-01-01 00:00:00	1904-01-01 00:00:00	Open	Brisbane	True	4
3	5049004	J2482	1904/106	Khardin of Hambleton, Cairns, Qld - birthplace: Punjaub, India - departed Cairns, Queensland 28 August 1908	1904 - 1908	1904-01-01 00:00:00	1908-01-01 00:00:00	Open	Brisbane	True	5
4	5049005	J2482	1904/108	Yep Fat of Junda, Qld - birthplace: Canton, China - departed Brisbane on the Tsinan 20 June 1904	1904 - 1904	1904-01-01 00:00:00	1904-01-01 00:00:00	Open	Brisbane	True	4

Plot content dates

In [5]:

# Build the content-dates figure with the series_details helper, then render it in the notebook.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [6]:

# Lower-case every file title and join them, space-separated, into one string.
# NOTE(review): the extra alias `a` is not used in the cells visible here —
# confirm nothing later relies on it, then drop it.
title_text = a = (
    df['title']
    .str.lower()
    .str.cat(sep=' ')
)

In [7]:

# Show word counts for the combined title text.
series_details.display_word_counts(title_text)

Out[7]:

	word	count
9	queensland	950
5	birthplace	745
8	departed	730
7	china	722
6	canton	636
27	cairns	402
4	qld	375
66	1905	269
14	ah	259
13	1904	226
36	brisbane	207
46	townsville	173
126	australian	163
2	geraldton	156
99	island	145
106	thursday	140
83	eastern	136
10	empire	130
635	1903	120
85	november	100
17	near	98
58	1906	97
74	december	97
3	innisfail	96
206	january	78

In [8]:

# Number of words per ngram: 2 gives bigrams. Change ngram_count for larger ngrams (trigrams etc).
ngram_count = 2
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	china departed	690
1	on the	678
2	canton china	636
3	birthplace canton	635
4	queensland on	587
5	qld birthplace	375
6	queensland birthplace	318
7	cairns queensland	258
8	departed cairns	221
9	the australian	163
10	brisbane queensland	160
11	townsville queensland	141
12	thursday island	140
13	the eastern	136
14	departed brisbane	131
15	the empire	130
16	cairns qld	109
17	departed townsville	108
18	geraldton innisfail	96
19	of cairns	82
20	of geraldton	82
21	near cairns	81
22	departed geraldton	73
23	departed thursday	72
24	1905 ah	68

In [10]: