series = 'J2483'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise Plotly's offline notebook mode before any figures are drawn
py.init_notebook_mode()

# Any '/' in the series identifier is replaced with '-' to build the name of
# the CSV file, which is read from the 'data' directory.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# The two date columns are parsed into datetimes at load time
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

series_details.display_summary(series, df)
Total items | 14,438 |
---|---|
Access status | |
Open | 14,436 (99.99%) |
Not yet examined | 2 (0.01%) |
Number of items digitised | 14,436 (99.99%) |
Number of pages digitised | 79,210 |
Date of earliest content | 1903 |
Date of latest content | 1956 |
# Change the number_of_rows value to see more
number_of_rows = 5

# CSS rules for the rendered table: centre the text in header cells, and hide
# the row headings (the index) together with the blank corner cell.
header_rule = {'selector': 'th', 'props': [('text-align', 'center')]}
hide_index_rule = {'selector': '.row_heading, .blank', 'props': [('display', 'none')]}

# Display dataframe, with the title column left-aligned
(
    df.head(number_of_rows)
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([header_rule, hide_index_rule])
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9086001 | J2483 | 16/16 | Certificate Exempting from Dictation Test (CEDT) - Name: Yong Min - Nationality: Chinese - Birthplace: Canton - departed for China per TAIYUAN on 3 February 1909 | 1909 - 1909 | 1909-01-01 00:00:00 | 1909-01-01 00:00:00 | Open | Brisbane | True | 2 |
1 | 9086002 | J2483 | 16/17 | Certificate Exempting from Dictation Test (CEDT) - Name: Hong Chin - Nationality: Chinese - Birthplace: Canton - departed for China per TAIYUAN on 3 February 1909, returned to Cairns per EMPIRE on 16 June 1910 | 1909 - 1910 | 1909-01-01 00:00:00 | 1910-01-01 00:00:00 | Open | Brisbane | True | 7 |
2 | 9086003 | J2483 | 16/18 | Certificate Exempting from Dictation Test (CEDT) - Name: Ah Mun - Nationality: Chinese - Birthplace: Canton - departed for China per SS EASTERN on 11 June 1909, returned to Cairns per EASTERN on 22 October 1910 | 1909 - 1910 | 1909-01-01 00:00:00 | 1910-01-01 00:00:00 | Open | Brisbane | True | 7 |
3 | 9086004 | J2483 | 16/21 | Certificate Exempting from Dictation Test (CEDT) - Name: Tommy Hong - Nationality: Chinese - Birthplace: Canton - departed for China per EMPIRE on 17 February 1909, returned to Brisbane per EMPIRE on 6 November 1911 | 1909 - 1911 | 1909-01-01 00:00:00 | 1911-01-01 00:00:00 | Open | Brisbane | True | 7 |
4 | 9086005 | J2483 | 16/22 | Certificate Exempting from Dictation Test (CEDT) - Name: Duck Shan - Nationality: Chinese - Birthplace: Canton - departed for China per EMPIRE on 18 February 1909 | 1909 - 1909 | 1909-01-01 00:00:00 | 1909-01-01 00:00:00 | Open | Brisbane | True | 2 |
# Build a figure from the dataframe with the series_details helper, then
# render it inline using Plotly's offline mode.
# NOTE(review): presumably a bar chart of item dates, going by the filename — confirm in series_details.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by a space, so that words can be counted
# across the whole series.
title_text = df['title'].str.lower().str.cat(sep=' ')
series_details.display_word_counts(title_text)
| | word | count |
|---|---|---|
14 | per | 24,566 |
0 | certificate | 14,547 |
4 | cedt | 14,452 |
5 | name | 14,441 |
1 | exempting | 14,441 |
2 | dictation | 14,440 |
3 | test | 14,440 |
8 | nationality | 14,364 |
10 | birthplace | 13,830 |
12 | departed | 13,119 |
9 | chinese | 11,753 |
21 | returned | 11,529 |
13 | china | 10,988 |
11 | canton | 10,350 |
22 | cairns | 4,777 |
119 | maru | 4,649 |
45 | townsville | 4,261 |
36 | brisbane | 3,882 |
30 | eastern | 3,139 |
62 | st | 2,569 |
46 | december | 2,563 |
38 | november | 2,544 |
63 | albans | 2,540 |
27 | ah | 2,302 |
33 | october | 2,280 |
# Number of words in each ngram: 2 produces two-word phrases (bigrams).
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Display the most frequent ngrams of that size found in the combined titles
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
|---|---|---|
0 | exempting from | 14,441 |
1 | certificate exempting | 14,440 |
2 | from dictation | 14,440 |
3 | dictation test | 14,440 |
4 | test cedt | 14,438 |
5 | cedt name | 14,415 |
6 | departed for | 13,078 |
7 | nationality chinese | 11,675 |
8 | chinese birthplace | 11,268 |
9 | birthplace canton | 10,289 |
10 | for china | 9,148 |
11 | china per | 9,140 |
12 | canton departed | 8,297 |
13 | returned to | 7,009 |
14 | maru on | 3,819 |
15 | townsville per | 3,443 |
16 | brisbane per | 3,172 |
17 | cairns per | 2,994 |
18 | per eastern | 2,948 |
19 | st albans | 2,538 |
20 | eastern on | 2,497 |
21 | per st | 2,354 |
22 | to brisbane | 2,015 |
23 | to townsville | 2,002 |
24 | hong kong | 1,980 |