# Identifier of the series to summarise; it also determines the CSV file name below.
series = 'J3115'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Prepare Plotly for rendering charts inline in the notebook.
py.init_notebook_mode()
# Any slashes in the series identifier are replaced with hyphens to form the file name.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)
# Load the series data, parsing the two date columns as datetimes.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])
# Show the summary table for this series.
series_details.display_summary(series, df)
Total items | 161 |
---|---|
Access status | |
Open | 161 (100.00%) |
Number of items digitised | 161 (100.00%) |
Number of pages digitised | 1,344 |
Date of earliest content | 1899 |
Date of latest content | 1928 |
# Number of rows to preview -- increase number_of_rows to see more
number_of_rows = 5
# Table-level CSS rules: centre the header cells and hide the index column
centred_headers = {'selector': 'th', 'props': [('text-align', 'center')]}
hidden_index = {'selector': '.row_heading, .blank', 'props': [('display', 'none')]}
# Take the first rows and left-align the 'title' column
preview = df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'})
# Keep the styled table as the final expression so the notebook renders it
preview.set_table_styles([centred_headers, hidden_index])
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5058001 | J3115 | 1 | Certificate of Domicile for Mah Wah, a market gardener from Bundaberg - includes photographs | 1902 - 1902 | 1902-01-01 00:00:00 | 1902-01-01 00:00:00 | Open | Brisbane | True | 1 |
1 | 5058002 | J3115 | 2 | Certificate of Domicile for Sui Tim, a fruiterer and general merchant from Brisbane - includes photographs | 1902 - 1903 | 1902-01-01 00:00:00 | 1903-01-01 00:00:00 | Open | Brisbane | True | 1 |
2 | 5058003 | J3115 | 50 | Certificate of Domicile for Charlie Jock, a storekeeper from Clermont - includes photographs | 1903 - 1905 | 1903-01-01 00:00:00 | 1905-01-01 00:00:00 | Open | Brisbane | True | 2 |
3 | 5058004 | J3115 | 3 | Certificate of Domicile for Tommy Young Hopp, a cook from Brisbane - includes photographs | 1902 - 1903 | 1902-01-01 00:00:00 | 1903-01-01 00:00:00 | Open | Brisbane | True | 1 |
4 | 5058005 | J3115 | 4 | Certificate of Domicile for Jong Hee, the owner of a paper bag factory from Brisbane - includes photographs | 1902 - 1904 | 1902-01-01 00:00:00 | 1904-01-01 00:00:00 | Open | Brisbane | True | 1 |
# Build the dates figure for this series from the dataframe.
# plot_dates lives in the local series_details module; its return value is handed straight to iplot.
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lowercased string, with the
# individual titles separated by spaces. The same string is reused further
# down for the ngram counts.
title_text = df['title'].str.lower().str.cat(sep=' ')
# Display the word counts for the combined title text
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
0 | certificate | 168 |
7 | includes | 139 |
1 | domicile | 109 |
8 | photographs | 100 |
45 | ah | 74 |
106 | application | 55 |
181 | photograph | 46 |
104 | correspondence | 43 |
17 | storekeeper | 41 |
178 | exemption | 40 |
5 | gardener | 36 |
105 | relating | 35 |
14 | brisbane | 17 |
158 | birth | 15 |
391 | pages | 14 |
174 | born | 13 |
194 | queensland | 13 |
39 | island | 13 |
44 | townsville | 13 |
115 | also | 13 |
13 | merchant | 12 |
38 | thursday | 12 |
154 | father | 12 |
48 | lee | 11 |
65 | hing | 11 |
# Number of words per ngram: 2 gives bigrams (word pairs).
# Change ngram_count for larger ngrams (3 for trigrams, etc.)
ngram_count = 2
# Display the top ngrams of that size found in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | certificate of | 150 |
1 | of domicile | 109 |
2 | includes photographs | 94 |
3 | domicile for | 93 |
4 | photographs certificate | 51 |
5 | application for | 48 |
6 | of exemption | 40 |
7 | for certificate | 38 |
8 | includes photograph | 36 |
9 | relating to | 35 |
10 | for ah | 35 |
11 | a storekeeper | 33 |
12 | correspondence relating | 33 |
13 | storekeeper from | 29 |
14 | exemption for | 23 |
15 | a gardener | 23 |
16 | gardener from | 22 |
17 | to the | 21 |
18 | photographs application | 15 |
19 | a certificate | 15 |
20 | photograph certificate | 15 |
21 | for a | 14 |
22 | birth certificate | 14 |
23 | from the | 13 |
24 | and correspondence | 13 |