In [1]:

# Identifier of the series this notebook examines; later cells use it to
# locate the CSV file and to label the summary.
series = "K1145"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode before any figure is drawn in later cells
py.init_notebook_mode()

In [3]:

# The CSV is named after the series; any '/' in the series id is swapped for '-' in the file name
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the item records, parsing the two date columns as datetimes
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [4]:

# Render the series overview (shown below) from the series id and its item dataframe,
# using the helper in the imported series_details module
series_details.display_summary(series, df)

National Archives of Australia: Series K1145

Certificates of Exemption from Dictation Test, annual certificate number order

Total items	4,816
Access status
Open	4,791 (99.48%)
Not yet examined	25 (0.52%)
Number of items digitised	175 (3.63%)
Number of pages digitised	874
Date of earliest content	1900
Date of latest content	1955

Download the complete CSV file

Content preview

In [4]:

# How many rows of the dataframe to preview -- raise this value to see more
number_of_rows = 5

# Table-level CSS: centre the header cells, and hide the row-index cells
# together with the blank corner cell above them.
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Left-align the title column on the first rows, then attach the table styles.
# The styled frame is the cell's last expression, so the notebook renders it.
preview = df.iloc[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'})
preview.set_table_styles(table_styles)

Out[4]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	1719426	K1145	1900/95	Ah Kin [Chinese]	1900 - 1900	1900-01-01 00:00:00	1900-01-01 00:00:00	Open	Perth	False
1	1719427	K1145	1900/116	Ah Leck [Chinese]	1900 - 1900	1900-01-01 00:00:00	1900-01-01 00:00:00	Open	Perth	False
2	1719428	K1145	1900/144	Ah Shim [Chinese]	1900 - 1900	1900-01-01 00:00:00	1900-01-01 00:00:00	Open	Perth	False
3	1719429	K1145	1900/165	Mahomet Rasool [Afghan]	1900 - 1900	1900-01-01 00:00:00	1900-01-01 00:00:00	Open	Perth	False
4	1719431	K1145	1900/169	Lee Yacke [Chinese]	1900 - 1900	1900-01-01 00:00:00	1900-01-01 00:00:00	Open	Perth	False

Plot content dates

In [5]:

# Build a figure from the item dataframe with the series_details helper
# (presumably charts the parsed start_date/end_date columns -- confirm in series_details.plot_dates)
fig = series_details.plot_dates(df)
# Render the figure inline through plotly's offline mode
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [6]:

# Lower-case every item title, then join them into one space-separated string
lowercase_titles = df['title'].str.lower()
title_text = lowercase_titles.str.cat(sep=' ')
# NOTE(review): `a` is a second name for the same string; no visible cell reads it --
# confirm nothing further down uses it, then drop this binding.
a = title_text

In [7]:

# Tabulate the most frequent words in the combined titles (word/count table rendered below)
series_details.display_word_counts(title_text)

Out[7]:

	word	count
2	chinese	3,293
0	ah	1,064
90	japanese	708
30	indian	593
8	lee	321
57	fong	222
139	wong	192
12	sing	158
7	afghan	144
122	singh	131
54	chen	123
40	chong	98
172	chung	89
44	wing	88
75	yee	78
167	chew	78
210	mahomed	73
67	wah	72
157	chin	69
71	kee	68
66	hong	63
34	khan	61
323	sam	58
79	hing	57
673	born	49

In [8]:

# Number of words per ngram: 2 gives bigrams; change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Tabulate the most frequent ngrams of that size in the combined titles (table rendered below)
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	chinese ah	741
1	chinese lee	199
2	singh indian	129
3	japanese ah	122
4	sing chinese	110
5	chinese wong	106
6	indian ah	91
7	fong chinese	78
8	chinese fong	78
9	chinese chen	77
10	chong chinese	70
11	you chinese	66
12	kee chinese	62
13	lee chinese	53
14	wah chinese	50
15	hong chinese	50
16	wing chinese	49
17	chinese chung	47
18	hing chinese	46
19	shing chinese	45
20	ah sing	45
21	bux indian	43
22	chinese yee	42
23	chew chinese	41
24	sam chinese	40

In [10]: