In [1]:

# Identifier of the series to examine; later cells use it to build the CSV file name and pass it to the summary display
series = 'SP115/1'

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode before any figures are drawn with py.iplot
py.init_notebook_mode()

In [3]:

# The CSV file is named after the series, with '/' replaced by '-' (e.g. 'SP115/1' -> 'SP115-1.csv')
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the harvested items, parsing the start and end date columns as datetimes
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

In [4]:

# Show an overview of the series using the helper from the imported series_details module
series_details.display_summary(series, df)

National Archives of Australia: Series SP115/1

Folders containing Certificates of Exemption and related papers for passengers arriving in Australia by ship, chronological series

Total items	1,787
Access status
Open	1,787 (100.00%)
Number of items digitised	9 (0.50%)
Number of pages digitised	285
Date of earliest content	1884
Date of latest content	1943

Download the complete CSV file

## Content preview

In [5]:

# Change the number_of_rows value to see more
number_of_rows = 5

# Centre the header cells and hide the index column (row headings plus the blank corner cell)
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe with the title column left-aligned
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles(table_styles)
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	1592127	SP115/1	UGANDA - 13/05/1915 [BOX 15]	UGANDA - Date of Arrival 13/05/1915 - [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15]	1912 - 1915	1912-01-01 00:00:00	1915-01-01 00:00:00	Open	Sydney	False
1	1592383	SP115/1	JOSEPH SIMMS - 13/05/1915 [BOX 15]	JOSEPH SIMMS - Date of Arrival 13/05/1915 [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15]	1914 - 1915	1914-01-01 00:00:00	1915-01-01 00:00:00	Open	Sydney	False
2	1592840	SP115/1	TAIYUAN - [PART 1] - 30/05/1915 [BOX 15]	TAIYUAN - [Part 1] - Date of Arrival 30/05/1915 - [Certificates of Exemption for passengers; includes photographs and hand prints][[Box 15]	1914 - 1915	1914-01-01 00:00:00	1915-01-01 00:00:00	Open	Sydney	False
3	1592858	SP115/1	TAIYUAN -[PART 2] - 30/05/1915 [BOX 15]	TAIYUAN -[Part 2] - Date of Arrival 30/05/1915 - [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15]	1905 - 1915	1905-01-01 00:00:00	1915-01-01 00:00:00	Open	Sydney	False
4	1592871	SP115/1	EASTERN - [PART 1] - 05/06/1915 [BOX 15]	EASTERN - [Part 1] - Date of Arrival 05/06/1915 [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15]	1914 - 1915	1914-01-01 00:00:00	1915-01-01 00:00:00	Open	Sydney	False

## Plot content dates

In [6]:

# Build a figure from the dataframe with the series_details helper, then render it inline with plotly
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')

## View word frequencies

In [7]:

# Combine all of the file titles into a single string:
# lower-case each title, then join them with single spaces
lowercase_titles = df['title'].str.lower()
title_text = a = lowercase_titles.str.cat(sep=' ')
# NOTE(review): `a` is not used in the visible cells and looks like a leftover alias —
# confirm nothing later relies on it, then drop it

In [8]:

# Show the most frequent words in the combined titles
series_details.display_word_counts(title_text)

Out[8]:

	word	count
4	certificates	1,773
11	box	1,763
5	exemption	1,697
6	passengers	1,696
7	includes	1,690
1	date	1,687
8	photographs	1,684
2	arrival	1,682
9	hand	1,612
10	prints	1,612
31	pages	664
16	part	628
39	2cm	415
17	1	388
19	2	255
60	maru	189
22	3	169
23	4	156
36	st	140
58	1cm	137
37	albans	135
24	5	130
20	eastern	126
749	taiping	119
689	tanda	105

In [9]:

# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Show the most frequent ngrams of that size in the combined titles
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	of exemption	1,697
1	exemption for	1,696
2	for passengers	1,696
3	passengers includes	1,689
4	date of	1,687
5	certificates of	1,686
6	includes photographs	1,683
7	photographs and	1,683
8	of arrival	1,682
9	hand prints	1,612
10	and hand	1,606
11	pages box	655
12	2cm box	404
13	prints 2cm	399
14	prints box	350
15	1 date	233
16	part 1	232
17	part 2	220
18	2 date	218
19	1cm box	136
20	st albans	135
21	prints 1cm	118
22	eastern part	115
23	3 date	112
24	part 3	111

In [10]: