# Identifier of the record series this notebook reports on.
series = 'J2481'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise plotly's offline mode for use inside the notebook.
py.init_notebook_mode()

# The CSV sits under data/ and is named after the series identifier, with any
# '/' replaced by '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Render the summary for this series using the project helper.
series_details.display_summary(series, df)
Total items | 858 |
---|---|
Access status | |
Open | 858 (100.00%) |
Number of items digitised | 858 (100.00%) |
Number of pages digitised | 2,031 |
Date of earliest content | 1897 |
Date of latest content | 1903 |
# How many rows of the dataframe to preview; raise this value to see more.
number_of_rows = 5

# Table-level CSS rules: centre the header cells and hide the index column.
centred_headers = dict(selector="th", props=[("text-align", "center")])
hidden_index = dict(selector='.row_heading, .blank', props=[('display', 'none')])

# Preview the first rows with left-aligned titles. This is left as a bare
# expression so the notebook displays the resulting Styler.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([centred_headers, hidden_index])
)
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5043565 | J2481 | 1898/1 | Chan Fong | 1898 - 1899 | 1898-01-01 00:00:00 | 1899-01-01 00:00:00 | Open | Brisbane | True | 2 |
1 | 5043566 | J2481 | 1898/2 | Hong Sun | 1898 - 1898 | 1898-01-01 00:00:00 | 1898-01-01 00:00:00 | Open | Brisbane | True | 2 |
2 | 5043567 | J2481 | 1898/3 | Yong Gun | 1898 - 1899 | 1898-01-01 00:00:00 | 1899-01-01 00:00:00 | Open | Brisbane | True | 2 |
3 | 5043568 | J2481 | 1898/4 | Ah Pow | 1898 - 1900 | 1898-01-01 00:00:00 | 1900-01-01 00:00:00 | Open | Brisbane | True | 2 |
4 | 5043569 | J2481 | 1898/5 | Ah Choy | 1898 - 1900 | 1898-01-01 00:00:00 | 1900-01-01 00:00:00 | Open | Brisbane | True | 3 |
# Build the dates figure with the project helper, then render it inline
# through plotly's offline mode.
dates_figure = series_details.plot_dates(df)
py.iplot(
    dates_figure,
    filename='series-dates-bar',
)
# Lower-case every file title, then join them into one space-separated string.
lowercase_titles = df['title'].str.lower()
# NOTE(review): `a` is a second name for the same string and is not used in
# the visible code; kept so the notebook's namespace is unchanged.
title_text = a = lowercase_titles.str.cat(sep=' ')

# Show word frequencies for the combined title text.
series_details.display_word_counts(title_text)
word | count | |
---|---|---|
6 | ah | 393 |
24 | lee | 77 |
35 | sing | 51 |
14 | sam | 44 |
10 | chong | 39 |
71 | wong | 27 |
3 | sun | 26 |
11 | kee | 26 |
53 | correspondence | 24 |
38 | wah | 24 |
69 | lum | 23 |
49 | young | 23 |
50 | see | 19 |
25 | yee | 19 |
23 | hing | 18 |
0 | chan | 17 |
58 | gee | 17 |
41 | hop | 16 |
238 | chinese | 15 |
57 | long | 14 |
130 | low | 14 |
426 | relating | 14 |
118 | chew | 14 |
2 | hong | 13 |
107 | sue | 13 |
# Number of consecutive words per ngram; 2 counts word pairs (bigrams).
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Show the top ngrams of that size found in the combined, lower-cased titles.
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | ah sam | 24 |
1 | sing ah | 19 |
2 | chong ah | 17 |
3 | ah sing | 16 |
4 | sam ah | 15 |
5 | relating to | 14 |
6 | correspondence relating | 14 |
7 | lee ah | 9 |
8 | ah wah | 9 |
9 | ah kee | 9 |
10 | yee ah | 9 |
11 | ah foon | 8 |
12 | ah young | 8 |
13 | see ah | 8 |
14 | ah see | 8 |
15 | ah chong | 8 |
16 | ah choy | 8 |
17 | wah ah | 8 |
18 | ah you | 7 |
19 | gee ah | 7 |
20 | hing ah | 7 |
21 | ah yee | 7 |
22 | ah gee | 7 |
23 | sun ah | 7 |
24 | ah lee | 6 |