import os
import pandas as pd
import series_details
import plotly.offline as py

# Identifier of the series examined in this notebook.
series = 'BP343/15'

# Set up plotly so that charts render inline in the notebook.
py.init_notebook_mode()

# The harvested CSV is named after the series, with '/' swapped for '-'
# so that the identifier is usable as a file name.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse the two date columns into datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary table for the series.
series_details.display_summary(series, df)
Total items | 2,571 |
---|---|
Access status | |
Open | 2,566 (99.81%) |
Not yet examined | 5 (0.19%) |
Number of items digitised | 85 (3.31%) |
Number of pages digitised | 176 |
Date of earliest content | 1916 |
Date of latest content | 1955 |
# Change the number_of_rows value to see more
number_of_rows = 5

# CSS rules for the preview table: centre the header cells and hide the
# index column (row headings plus the blank corner cell).
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display dataframe: first rows only, with the long 'title' column left-aligned
preview = df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'})
preview.set_table_styles(table_styles)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9103820 | BP343/15 | 14/1013 | Name: Lum Yee - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 466/21 | 1929 - 1932 | 1929-01-01 00:00:00 | 1932-01-01 00:00:00 | Open | Brisbane | False | 0 |
1 | 9108210 | BP343/15 | 13/824 | Name: Hoo Wah (of Townsville) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/23 | 1928 - 1929 | 1928-01-01 00:00:00 | 1929-01-01 00:00:00 | Open | Brisbane | False | 0 |
2 | 9108211 | BP343/15 | 13/823 | Name: Ah Cow (of Charters Towers) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/19 | 1928 - 1928 | 1928-01-01 00:00:00 | 1928-01-01 00:00:00 | Open | Brisbane | False | 0 |
3 | 9108212 | BP343/15 | 13/822 | Name: Bon Kan [Bu Conn] (of Townsville) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/28 | 1928 - 1928 | 1928-01-01 00:00:00 | 1928-01-01 00:00:00 | Open | Brisbane | False | 0 |
4 | 9108213 | BP343/15 | 13/821 | Name: Ah Hat - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/17 | 1928 - 1928 | 1928-01-01 00:00:00 | 1928-01-01 00:00:00 | Open | Brisbane | False | 0 |
# Build a figure from the dataframe via series_details.plot_dates
# (judging by the filename below, presumably a bar chart of the content
# dates -- confirm in series_details).
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook using plotly's offline mode.
py.iplot(fig, filename='series-dates-bar')
# Lower-case every file title so that word matching ignores case
lowercase_titles = df['title'].str.lower()
# Combine all of the titles into a single space-separated string
title_text = a = lowercase_titles.str.cat(sep=' ')
# Show a table of word frequencies for the combined text
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
0 | name | 2,565 |
3 | nationality | 2,542 |
5 | birthplace | 2,460 |
12 | number | 2,323 |
7 | certificate | 2,322 |
11 | cedt | 2,315 |
9 | dictation | 2,313 |
10 | test | 2,313 |
8 | exemption | 2,312 |
4 | chinese | 2,189 |
6 | canton | 1,950 |
16 | townsville | 852 |
18 | ah | 447 |
73 | lee | 242 |
174 | japanese | 195 |
175 | japan | 177 |
36 | chong | 129 |
89 | indian | 122 |
93 | sing | 121 |
145 | wong | 112 |
77 | leong | 112 |
2 | yee | 110 |
1 | lum | 109 |
15 | wah | 104 |
171 | india | 97 |
# Change ngram_count for larger ngrams (trigrams etc)
# The value is the number of words per ngram: 2 produces two-word phrases.
ngram_count = 2
# Display a table of ngrams of that size, with their counts, taken from
# the combined title text built earlier in the notebook.
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | cedt number | 2,315 |
1 | dictation test | 2,313 |
2 | the dictation | 2,312 |
3 | certificate of | 2,312 |
4 | of exemption | 2,312 |
5 | from the | 2,312 |
6 | exemption from | 2,312 |
7 | test cedt | 2,312 |
8 | nationality chinese | 2,171 |
9 | chinese birthplace | 2,109 |
10 | birthplace canton | 1,949 |
11 | canton certificate | 1,854 |
12 | of townsville | 832 |
13 | townsville nationality | 830 |
14 | name ah | 322 |
15 | nationality japanese | 195 |
16 | japanese birthplace | 182 |
17 | japan certificate | 166 |
18 | birthplace japan | 165 |
19 | name lee | 149 |
20 | nationality indian | 117 |
21 | indian birthplace | 111 |
22 | canton name | 93 |
23 | india certificate | 91 |
24 | name leong | 87 |