In [2]:

# Identifier of the archival series that every cell below loads and summarises
series = "B6003"

In [3]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode so that charts can be displayed inline
py.init_notebook_mode()

In [4]:

# Load the harvested item list for this series from the data directory.
# Any '/' in the series identifier is replaced with '-' to form the file name,
# and the two date columns are parsed into datetimes.
df = pd.read_csv(
    os.path.join('data', '{}.csv'.format(series.replace('/', '-'))),
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Show an overview of the series: item totals, access status, digitisation counts and date range
series_details.display_summary(series, df)

National Archives of Australia: Series B6003

Registers of Certificates Exempting from the Dictation Test (Departures), Melbourne

Total items	3
Access status
Open	3 (100.00%)
Number of items digitised	0 (0.00%)
Number of pages digitised	0
Date of earliest content	1904
Date of latest content	1959

Download the complete CSV file

Content preview

In [5]:

# How many rows to include in the preview -- increase this value to see more
number_of_rows = 5

# Preview the first rows: titles are left-aligned, header cells are centred,
# and the index column (row headings plus the blank corner cell) is hidden.
(
    df[:number_of_rows]
    .style
    .set_properties(['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status
0	1072442	B6003	1	Register of Certificates Exempting from the Dictation Test (Departures), Melbourne 1904 - 1914	1904 - 1914	1904-01-01 00:00:00	1914-01-01 00:00:00	Open	Melbourne	False
1	1072443	B6003	2	Register of Certificates Exempting from the Dictation Test (Departures), Melbourne 1915 - 1933	1915 - 1933	1915-01-01 00:00:00	1933-01-01 00:00:00	Open	Melbourne	False
2	1072444	B6003	3	Register of Certificates Exempting from the Dictation Test (Departures), Melbourne 1934 - 1959	1934 - 1959	1934-01-01 00:00:00	1959-01-01 00:00:00	Open	Melbourne	False

Plot content dates

In [5]:

# Build a figure from the items' content dates
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [6]:

# Lower-case every item title and join them, separated by single spaces,
# into one string for the word-frequency cells
title_text = df['title'].str.lower().str.cat(sep=' ')

In [7]:

# Tabulate how often each word occurs in the combined titles
# (the output below omits common words such as 'of', 'from' and 'the')
series_details.display_word_counts(title_text)

Out[7]:

	word	count
0	register	3
1	certificates	3
2	exempting	3
3	dictation	3
4	test	3
5	departures	3
6	melbourne	3
7	1904	1
8	1914	1
9	1915	1
10	1933	1
11	1934	1
12	1959	1

In [8]:

# Number of words per sequence: 2 gives bigrams; raise it for larger ngrams (3 for trigrams, etc.)
ngram_count = 2
# Show the most frequent sequences of ngram_count words found in the combined titles
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	the dictation	3
1	test departures	3
2	of certificates	3
3	from the	3
4	dictation test	3
5	exempting from	3
6	departures melbourne	3
7	register of	3
8	certificates exempting	3
9	1933 register	1
10	1914 register	1
11	melbourne 1904	1
12	melbourne 1915	1
13	1904 1914	1
14	1915 1933	1
15	1934 1959	1
16	melbourne 1934	1

In [10]: