In [1]:

# Identifier of the archival series this notebook examines. It is reused below
# to locate the matching CSV in the data folder and to label the summary.
series = 'J2481'

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode up front, ahead of the py.iplot
# call further down that renders the dates chart.
py.init_notebook_mode()

In [3]:

# Load the harvested item list for this series. Any '/' in the series
# identifier is swapped for '-' to form the CSV file name, and the two date
# columns are parsed as dates while reading.
df = pd.read_csv(
    os.path.join('data', series.replace('/', '-') + '.csv'),
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Show the overview for this series using the project helper. In the rendered
# output this covers the series title, total items, access status,
# digitisation counts and the earliest/latest content dates.
series_details.display_summary(series, df)

National Archives of Australia: Series J2481

Proclamations under The Chinese Immigration Restriction Act 1888 & related correspondence, annual single number series

Total items	858
Access status
Open	858 (100.00%)
Number of items digitised	858 (100.00%)
Number of pages digitised	2,031
Date of earliest content	1897
Date of latest content	1903

Download the complete CSV file

Content preview

In [4]:

# Number of rows shown in the preview -- increase it to see more
number_of_rows = 5

# Render the leading rows as a styled table: titles are left-aligned, header
# cells are centred, and the `.row_heading` / `.blank` cells (the index column
# in pandas' generated HTML) are given `display: none`.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[4]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	5043565	J2481	1898/1	Chan Fong	1898 - 1899	1898-01-01 00:00:00	1899-01-01 00:00:00	Open	Brisbane	True	2
1	5043566	J2481	1898/2	Hong Sun	1898 - 1898	1898-01-01 00:00:00	1898-01-01 00:00:00	Open	Brisbane	True	2
2	5043567	J2481	1898/3	Yong Gun	1898 - 1899	1898-01-01 00:00:00	1899-01-01 00:00:00	Open	Brisbane	True	2
3	5043568	J2481	1898/4	Ah Pow	1898 - 1900	1898-01-01 00:00:00	1900-01-01 00:00:00	Open	Brisbane	True	2
4	5043569	J2481	1898/5	Ah Choy	1898 - 1900	1898-01-01 00:00:00	1900-01-01 00:00:00	Open	Brisbane	True	3

Plot content dates

In [5]:

# Build the content-dates figure with the project helper, then render it
# inline through plotly's offline mode.
# NOTE(review): plot_dates lives in series_details, outside this view;
# presumably it charts the start_date / end_date columns -- confirm.
fig = series_details.plot_dates(df)
# `filename` is handed straight to plotly; per the plotly.offline docs it names
# the image file if the chart is downloaded -- verify for the installed version.
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [6]:

# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by a space. The word and n-gram counts below are
# computed from this string.
# (The stray `a` alias from the original chained assignment has been dropped:
# it was never used and only added a throwaway name to the notebook namespace.)
title_text = df['title'].str.lower().str.cat(sep=' ')

In [7]:

# Tabulate the most frequent words in the combined titles using the project
# helper; the rendered output is a word/count table in descending order.
series_details.display_word_counts(title_text)

Out[7]:

	word	count
6	ah	393
24	lee	77
35	sing	51
14	sam	44
10	chong	39
71	wong	27
3	sun	26
11	kee	26
53	correspondence	24
38	wah	24
69	lum	23
49	young	23
50	see	19
25	yee	19
23	hing	18
0	chan	17
58	gee	17
41	hop	16
238	chinese	15
57	long	14
130	low	14
426	relating	14
118	chew	14
2	hong	13
107	sue	13

In [8]:

# Size of the word sequences to count: 2 gives bigrams, 3 gives trigrams, and
# so on. Change this value to look at longer n-grams.
ngram_count = 2
# Show the most frequent n-grams of that size found in the combined titles.
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	ah sam	24
1	sing ah	19
2	chong ah	17
3	ah sing	16
4	sam ah	15
5	relating to	14
6	correspondence relating	14
7	lee ah	9
8	ah wah	9
9	ah kee	9
10	yee ah	9
11	ah foon	8
12	ah young	8
13	see ah	8
14	ah see	8
15	ah chong	8
16	ah choy	8
17	wah ah	8
18	ah you	7
19	gee ah	7
20	hing ah	7
21	ah yee	7
22	ah gee	7
23	sun ah	7
24	ah lee	6

In [10]: