# Identifier of the series analysed in this notebook. It is used below to
# derive the CSV filename and is passed to series_details.display_summary().
series = 'PP4/2'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Switch plotly's offline module into notebook mode before any figure is drawn.
py.init_notebook_mode()

# The harvested data sits in the data/ directory, in a CSV named after the
# series with every '/' replaced by '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
df = pd.read_csv(
    os.path.join('data', csv_name),
    parse_dates=['start_date', 'end_date'],
)

# Show the summary for this series.
series_details.display_summary(series, df)
Total items | 613 |
---|---|
Access status | |
Open | 610 (99.51%) |
Not yet examined | 3 (0.49%) |
Number of items digitised | 28 (4.57%) |
Number of pages digitised | 1,512 |
Date of earliest content | 1903 |
Date of latest content | 1947 |
# Increase number_of_rows to preview more of the dataframe.
number_of_rows = 5

# Preview the first rows: titles left-aligned, header cells centred, and the
# row-index column (plus its blank corner cell) hidden through CSS.
(
    df.iloc[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
 | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4317014 | PP4/2 | 1915/1801 | Ah FAN [Chinese] [Application for certificate of exemption from dictation test] | 1915 - 1915 | 1915-01-01 00:00:00 | 1915-01-01 00:00:00 | Open | Perth | True | 22 |
1 | 4317015 | PP4/2 | 1926/12 | Chung ON [Chinese] [Application for certificate of exemption from dictation test] | 1913 - 1926 | 1913-01-01 00:00:00 | 1926-01-01 00:00:00 | Open | Perth | False | 0 |
2 | 4317016 | PP4/2 | 1926/26 | Chong Ah SIE [KEE] [Chinese] [Application for certificate of exemption from dictation test] | 1918 - 1926 | 1918-01-01 00:00:00 | 1926-01-01 00:00:00 | Open | Perth | True | 32 |
3 | 4317017 | PP4/2 | 1926/53 | Joan Hoon KWONG [Chinese] [Application for certificate of exemption from dictation test] | 1926 - 1928 | 1926-01-01 00:00:00 | 1928-01-01 00:00:00 | Open | Perth | False | 0 |
4 | 4317018 | PP4/2 | 1926/57 | To King GUE [Chinese] [Application for certificate of exemption from dictation test] | 1926 - 1928 | 1926-01-01 00:00:00 | 1928-01-01 00:00:00 | Open | Perth | False | 0 |
# Build a figure from the dataframe with the series_details helper
# (presumably charting the start_date/end_date columns -- confirm in series_details).
fig = series_details.plot_dates(df)
# Render the figure inline using plotly's offline mode.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by a space.
title_text = df['title'].str.lower().str.cat(sep=' ')
# Show the word frequencies for the combined titles.
series_details.display_word_counts(title_text)
 | word | count |
---|---|---|
3 | application | 602 |
4 | certificate | 602 |
5 | exemption | 602 |
6 | dictation | 602 |
7 | test | 602 |
2 | chinese | 418 |
0 | ah | 129 |
33 | japanese | 109 |
28 | indian | 56 |
75 | lee | 45 |
66 | wong | 31 |
34 | fong | 28 |
76 | sing | 20 |
130 | mrs | 17 |
185 | singh | 16 |
285 | yee | 15 |
19 | chen | 14 |
9 | chong | 13 |
20 | wing | 12 |
60 | wah | 11 |
38 | arrived | 10 |
8 | chung | 10 |
157 | quan | 10 |
159 | chin | 9 |
1 | fan | 9 |
# Change ngram_count for larger ngrams (trigrams etc).
# With the value 2 the table below lists two-word phrases (bigrams).
ngram_count = 2
# Show the most frequent ngrams of that size in the combined title text.
series_details.display_top_ngrams(title_text, ngram_count)
 | ngram | count |
---|---|---|
0 | dictation test | 602 |
1 | certificate of | 602 |
2 | from dictation | 602 |
3 | of exemption | 602 |
4 | exemption from | 602 |
5 | for certificate | 601 |
6 | application for | 601 |
7 | chinese application | 416 |
8 | test ah | 117 |
9 | japanese application | 109 |
10 | indian application | 56 |
11 | test lee | 36 |
12 | test wong | 26 |
13 | test mrs | 16 |
14 | singh indian | 16 |
15 | sing chinese | 15 |
16 | test chen | 13 |
17 | fong chinese | 12 |
18 | test fong | 11 |
19 | test yee | 10 |
20 | you chinese | 10 |
21 | chong chinese | 10 |
22 | shing chinese | 8 |
23 | test chin | 7 |
24 | wing chinese | 7 |