# Identifier of the series to examine. It is used below to build the CSV
# filename (with '/' replaced by '-') and is passed to the summary display.
series = 'ST84/1'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Switch plotly's offline module into notebook mode before drawing anything.
py.init_notebook_mode()

# The data file sits in the 'data' directory and is named after the series
# identifier, with '/' swapped for '-' so it is a valid filename.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series.
series_details.display_summary(series, df)
Total items | 2,765 |
---|---|
Access status | |
Open | 2,758 (99.75%) |
Not yet examined | 7 (0.25%) |
Number of items digitised | 434 (15.70%) |
Number of pages digitised | 13,979 |
Date of earliest content | 1855 |
Date of latest content | 1975 |
# Change the number_of_rows value to see more
number_of_rows = 5

# Centre the header cells, and hide the row headings and blank corner cell.
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe, with the titles left-aligned
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles(table_styles)
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1731871 | ST84/1 | 1907/391-400 | James Lee Chong, Way Sing, Walter Hing Hee, Ah See, Charlie Joy, Nicholas Saseen, Foo Jun, Hop Sing, Sun Sing Lee and Jack Hoy [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 16] | 1907 - 1907 | 1907-01-01 00:00:00 | 1907-01-01 00:00:00 | Open | Sydney | True | 34 |
1 | 7288001 | ST84/1 | 1919/270/81-90 | Jong Say, Wong Kwong, Lee You Wing, Foo Gun, Mar Kum, Gock Buck, Ah Get, Jeong Keong, Percy Zuinn and Ah Yum [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 122] | 1919 - 1919 | 1919-01-01 00:00:00 | 1919-01-01 00:00:00 | Open | Sydney | False | 0 |
2 | 7288002 | ST84/1 | 1919/270/91-100 | Ming Gar, Ah Loong, Lun Soy, Gung Sun, Ah Lock, John Nop or Jan Nap, Peter Sing, Louie Wee, Sue Hoo and Lee Yuen [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 122] | 1919 - 1919 | 1919-01-01 00:00:00 | 1919-01-01 00:00:00 | Open | Sydney | False | 0 |
3 | 7288003 | ST84/1 | 1919/271/1-10 | Charley Eip, Chew Bun, Lee Chut, Lum Gow, Tommy Low, Low Zuai, Charlie Gong or Charlie Kwong, Lee So, Chi Wort and Go Foo [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 123] | 1919 - 1919 | 1919-01-01 00:00:00 | 1919-01-01 00:00:00 | Open | Sydney | False | 0 |
4 | 7288004 | ST84/1 | 1919/271/11-20 | Dewan Singh, Joseph Sequiera, Lee Gum Sue, Ah Suey, Fong Foon, Ah Seck, Man Duck, Lee Tim and Ah Moon [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 123] | 1919 - 1919 | 1919-01-01 00:00:00 | 1919-01-01 00:00:00 | Open | Sydney | False | 0 |
# Build a figure from the dataframe with the series_details helper
# (presumably a bar chart of the item dates, going by the filename below).
fig = series_details.plot_dates(df)
# Render the figure using plotly's offline module
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string,
# with consecutive titles separated by a single space
title_text = df['title'].str.lower().str.cat(sep=' ')
# Display the word counts for the combined titles
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
8 | ah | 6,070 |
29 | box | 2,830 |
20 | certificate | 2,747 |
1 | lee | 2,737 |
24 | includes | 2,735 |
26 | hand | 2,732 |
25 | left | 2,732 |
27 | impression | 2,732 |
28 | photographs | 2,715 |
21 | exempting | 2,646 |
23 | test | 2,646 |
22 | dictation | 2,646 |
2 | chong | 1,228 |
4 | sing | 1,171 |
33 | wong | 1,060 |
107 | young | 1,000 |
108 | yee | 934 |
110 | george | 702 |
170 | choy | 679 |
6 | hing | 653 |
73 | low | 650 |
10 | charlie | 612 |
70 | lum | 609 |
86 | fong | 580 |
95 | gee | 575 |
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2  # 2 = bigrams (two-word sequences)
# Display the top ngrams of that size found in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | includes left | 2,732 |
1 | hand impression | 2,732 |
2 | left hand | 2,732 |
3 | photographs box | 2,715 |
4 | impression and | 2,712 |
5 | and photographs | 2,712 |
6 | dictation test | 2,646 |
7 | exempting from | 2,646 |
8 | from dictation | 2,646 |
9 | certificate exempting | 2,646 |
10 | test includes | 2,639 |
11 | and ah | 518 |
12 | or ah | 326 |
13 | lee ah | 199 |
14 | ah chong | 173 |
15 | ah sing | 172 |
16 | sing ah | 166 |
17 | ah sam | 165 |
18 | and lee | 152 |
19 | chong ah | 145 |
20 | sydney nsw | 128 |
21 | lee and | 128 |
22 | lee certificate | 125 |
23 | ah tong | 110 |
24 | sing and | 104 |