import os

import pandas as pd
import plotly.offline as py

import series_details

# Switch plotly into notebook mode so figures render inline.
py.init_notebook_mode()

# Identifier of the series examined in this notebook.
series = 'E752'

# The series data is read from data/<series>.csv; any '/' in the identifier
# is swapped for '-' when building the filename. The two date columns are
# parsed into datetimes on load.
df = pd.read_csv(
    os.path.join('data', '{}.csv'.format(series.replace('/', '-'))),
    parse_dates=['start_date', 'end_date'],
)

# Show the summary table for this series.
series_details.display_summary(series, df)
Total items | 722 |
---|---|
Access status | |
Open | 719 (99.58%) |
Not yet examined | 3 (0.42%) |
Number of items digitised | 717 (99.31%) |
Number of pages digitised | 9,310 |
Date of earliest content | 1905 |
Date of latest content | 1941 |
# Change the number_of_rows value to see more
number_of_rows = 5

# Preview the leading rows as a styled table: the title column is
# left-aligned, header cells are centred, and the row-heading / blank corner
# cells are hidden via CSS.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1591875 | E752 | 1916/1 | [Certificate of Exemption from Dictation Test - Sun Took] | 1916 - 1916 | 1916-01-01 00:00:00 | 1916-01-01 00:00:00 | Open | Darwin | True | 18 |
1 | 1591876 | E752 | 1916/2 | [Certificate of Exemption from Dictation Test - Ah Young] | 1918 - 1918 | 1918-01-01 00:00:00 | 1918-01-01 00:00:00 | Open | Darwin | True | 15 |
2 | 1591878 | E752 | 1916/3 | [Certificate of Exemption from Dictation Test - Cheong Yee] | 1918 - 1918 | 1918-01-01 00:00:00 | 1918-01-01 00:00:00 | Open | Darwin | True | 16 |
3 | 1591880 | E752 | 1916/4 | [Certificate of Exemption from Dictation Test - Chin Dick] | 1917 - 1917 | 1917-01-01 00:00:00 | 1917-01-01 00:00:00 | Open | Darwin | True | 15 |
4 | 1591883 | E752 | 1916/5 | [Certificate of Exemption from Dictation Test - Chin See Koon] | 1917 - 1917 | 1917-01-01 00:00:00 | 1917-01-01 00:00:00 | Open | Darwin | True | 15 |
# Build a figure from the dataframe with the project helper
# (presumably charting the start/end dates parsed at load time — confirm in series_details).
fig = series_details.plot_dates(df)
# Render the figure inline through plotly's offline mode (py is plotly.offline).
py.iplot(fig, filename='series-dates-bar')
# Lower-case every item title and join them, space-separated, into one string
title_text = df['title'].str.lower().str.cat(sep=' ')
# Secondary alias bound to the same string; not referenced in the visible
# cells, kept in case another cell relies on it.
a = title_text
# Show the word-frequency table for the combined titles
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
1 | exemption | 714 |
2 | dictation | 713 |
3 | test | 713 |
0 | certificate | 709 |
10 | chin | 127 |
6 | ah | 73 |
33 | fong | 64 |
17 | gee | 48 |
37 | lee | 39 |
138 | sing | 28 |
42 | yuen | 26 |
12 | see | 25 |
66 | kim | 25 |
9 | yee | 23 |
115 | wong | 23 |
16 | sue | 21 |
46 | wah | 20 |
29 | chong | 19 |
22 | hong | 19 |
45 | low | 17 |
67 | loong | 15 |
98 | bow | 14 |
26 | gum | 14 |
141 | ming | 14 |
64 | quan | 13 |
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2  # 2 = bigrams (two-word sequences)
# Display the ngram frequency table for the combined title text built earlier
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | exemption from | 714 |
1 | of exemption | 714 |
2 | from dictation | 713 |
3 | dictation test | 710 |
4 | certificate of | 708 |
5 | test chin | 100 |
6 | test ah | 54 |
7 | test fong | 43 |
8 | test gee | 32 |
9 | sing certificate | 22 |
10 | test wong | 22 |
11 | test lee | 16 |
12 | test low | 15 |
13 | see certificate | 14 |
14 | yee certificate | 13 |
15 | hong certificate | 12 |
16 | yuen certificate | 12 |
17 | kim certificate | 11 |
18 | sue certificate | 10 |
19 | test yuen | 10 |
20 | fong certificate | 10 |
21 | test loong | 9 |
22 | way certificate | 9 |
23 | test gum | 9 |
24 | test ching | 8 |