In [1]:

# Identifier of the National Archives of Australia series analysed in this notebook.
# It is also used (with '/' replaced by '-') to locate the series CSV in the data folder.
series = 'SP115/10'

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise Plotly's offline notebook mode before any figure is drawn with py.iplot()
py.init_notebook_mode()

In [3]:

# Load the item list for this series from the data folder. The file name is the
# series identifier with '/' swapped for '-' (e.g. 'SP115/10' -> 'SP115-10.csv'),
# and the two date columns are parsed into datetimes on load.
df = pd.read_csv(
    os.path.join('data', '{}.csv'.format(series.replace('/', '-'))),
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Show an overview of the series; the output below reports the series title, item count,
# access status, digitisation counts and the range of content dates.
series_details.display_summary(series, df)

National Archives of Australia: Series SP115/10

Certificates Exempting from the provisions of 'The Influx of Chinese Restriction Act 1881'

Total items	6
Access status
Open	6 (100.00%)
Number of items digitised	0 (0.00%)
Number of pages digitised	0
Date of earliest content	1884
Date of latest content	1888

Download the complete CSV file

Content preview¶

In [4]:

# Change the number_of_rows value to see more
number_of_rows = 5

# Show the first rows as a styled table: the title column is left-aligned,
# header cells are centred, and the row-index column is hidden through CSS.
(
    df.iloc[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[4]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	1826044	SP115/10	WHOLE SERIES	Certificates Exempting from the provisions of 'The Influx of Chinese Restriction Act 1881'	1884 - 1888	1884-01-01 00:00:00	1888-01-01 00:00:00	Open	Sydney	False
1	11014873	SP115/10	804	Ah Luck [box 1]	1884 - 1884	1884-01-01 00:00:00	1884-01-01 00:00:00	Open	Sydney	False
2	12145978	SP115/10	800	Ah See [box 1]	1884 - 1884	1884-01-01 00:00:00	1884-01-01 00:00:00	Open	Sydney	False
3	12187508	SP115/10	1092	Ah Gee [box 1]	1885 - 1885	1885-01-01 00:00:00	1885-01-01 00:00:00	Open	Sydney	False
4	12255210	SP115/10	1643	Ah Kum [box 1]	1886 - 1886	1886-01-01 00:00:00	1886-01-01 00:00:00	Open	Sydney	False

Plot content dates¶

In [5]:

# Build a figure from the dataframe using the series_details helper
# (presumably a chart of items by content date -- confirm against series_details.plot_dates).
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook via Plotly's offline mode
py.iplot(fig, filename='series-dates-bar')

View word frequencies¶

In [6]:

# Combine all of the file titles into a single lower-cased string,
# with individual titles separated by a space
title_text = df['title'].str.lower().str.cat(sep=' ')

In [7]:

# Tabulate how often each word occurs in the combined titles.
# NOTE(review): the output below omits common words such as 'of' and 'the', so the helper
# appears to filter stopwords -- confirm against series_details.display_word_counts.
series_details.display_word_counts(title_text)

Out[7]:

	word	count
8	ah	5
10	box	5
11	1	5
0	certificates	1
1	exempting	1
2	provisions	1
3	influx	1
4	chinese	1
5	restriction	1
6	act	1
7	1881	1
9	luck	1
12	see	1
13	gee	1
14	kum	1
15	gow	1

In [8]:

# Number of words in each n-gram: 2 gives bigrams; increase it for larger n-grams (trigrams etc.)
ngram_count = 2
# Tabulate the n-grams of that size found in the combined titles, with their counts
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	box 1	5
1	1 ah	4
2	restriction act	1
3	ah see	1
4	certificates exempting	1
5	act 1881	1
6	from the	1
7	ah luck	1
8	ah gee	1
9	gow box	1
10	luck box	1
11	influx of	1
12	see box	1
13	the provisions	1
14	ah kum	1
15	ah gow	1
16	provisions of	1
17	'the influx	1
18	chinese restriction	1
19	of chinese	1
20	kum box	1
21	gee box	1
22	1881 ah	1
23	exempting from	1
24	of 'the	1

In [ ]: