# Identifier of the series analysed in this notebook
series = 'J2482'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise Plotly's notebook mode before any charts are drawn
py.init_notebook_mode()

# Read the series' CSV from the data directory. Any '/' in the series
# identifier is replaced with '-' to form the file name, and the two date
# columns are parsed as datetimes.
df = pd.read_csv(
    os.path.join('data', '{}.csv'.format(series.replace('/', '-'))),
    parse_dates=['start_date', 'end_date'],
)
series_details.display_summary(series, df)
Total items | 799 |
---|---|
Access status | |
Open | 799 (100.00%) |
Number of items digitised | 798 (99.87%) |
Number of pages digitised | 3,153 |
Date of earliest content | 1902 |
Date of latest content | 1912 |
# Change the number_of_rows value to see more
number_of_rows = 5
# Display dataframe: left-align the title column, centre the header cells,
# and hide the row-index cells via CSS.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
 | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5049001 | J2482 | 1904/103 | Sheong Fook of Geraldton [Innisfail], Qld - birthplace: Canton, China - departed Geraldton [Innisfail], Queensland on the Empire 7 September 1904 | 1904 - 1904 | 1904-01-01 00:00:00 | 1904-01-01 00:00:00 | Open | Brisbane | True | 4 |
1 | 5049002 | J2482 | 1904/104 | Ah Gee of Macnade near Dungeness, Qld - birthplace: Canton, China - departed Dungeness, Queensland on the Tsinan 26 June 1904 | 1904 - 1904 | 1904-01-01 00:00:00 | 1904-01-01 00:00:00 | Open | Brisbane | True | 3 |
2 | 5049003 | J2482 | 1904/105 | Ah Yeen of Johnstone near Geraldton [Innisfail] - birthplace: Canton, China - departed Geraldton [Innisfail], Queensland on the Tsinan 25 June 1904 | 1904 - 1904 | 1904-01-01 00:00:00 | 1904-01-01 00:00:00 | Open | Brisbane | True | 4 |
3 | 5049004 | J2482 | 1904/106 | Khardin of Hambleton, Cairns, Qld - birthplace: Punjaub, India - departed Cairns, Queensland 28 August 1908 | 1904 - 1908 | 1904-01-01 00:00:00 | 1908-01-01 00:00:00 | Open | Brisbane | True | 5 |
4 | 5049005 | J2482 | 1904/108 | Yep Fat of Junda, Qld - birthplace: Canton, China - departed Brisbane on the Tsinan 20 June 1904 | 1904 - 1904 | 1904-01-01 00:00:00 | 1904-01-01 00:00:00 | Open | Brisbane | True | 4 |
# Build a figure from the dataframe with series_details.plot_dates
# (presumably a chart of the items' dates -- confirm in series_details)
# and render it inline with Plotly's offline iplot.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single string: lower-case each
# title, skip missing values, and join the rest with single spaces.
# NOTE(review): the extra alias `a` looks like a leftover; it is kept here
# in case a later cell refers to it -- confirm and remove if unused.
title_text = a = ' '.join(df['title'].str.lower().dropna())
series_details.display_word_counts(title_text)
 | word | count |
---|---|---|
9 | queensland | 950 |
5 | birthplace | 745 |
8 | departed | 730 |
7 | china | 722 |
6 | canton | 636 |
27 | cairns | 402 |
4 | qld | 375 |
66 | 1905 | 269 |
14 | ah | 259 |
13 | 1904 | 226 |
36 | brisbane | 207 |
46 | townsville | 173 |
126 | australian | 163 |
2 | geraldton | 156 |
99 | island | 145 |
106 | thursday | 140 |
83 | eastern | 136 |
10 | empire | 130 |
635 | 1903 | 120 |
85 | november | 100 |
17 | near | 98 |
58 | 1906 | 97 |
74 | december | 97 |
3 | innisfail | 96 |
206 | january | 78 |
# Change ngram_count for larger ngrams (trigrams etc)
# ngram_count is the number of words per ngram: 2 gives two-word phrases (bigrams).
ngram_count = 2
# Display the ngrams of that size found in the combined title text, with their counts.
series_details.display_top_ngrams(title_text, ngram_count)
 | ngram | count |
---|---|---|
0 | china departed | 690 |
1 | on the | 678 |
2 | canton china | 636 |
3 | birthplace canton | 635 |
4 | queensland on | 587 |
5 | qld birthplace | 375 |
6 | queensland birthplace | 318 |
7 | cairns queensland | 258 |
8 | departed cairns | 221 |
9 | the australian | 163 |
10 | brisbane queensland | 160 |
11 | townsville queensland | 141 |
12 | thursday island | 140 |
13 | the eastern | 136 |
14 | departed brisbane | 131 |
15 | the empire | 130 |
16 | cairns qld | 109 |
17 | departed townsville | 108 |
18 | geraldton innisfail | 96 |
19 | of cairns | 82 |
20 | of geraldton | 82 |
21 | near cairns | 81 |
22 | departed geraldton | 73 |
23 | departed thursday | 72 |
24 | 1905 ah | 68 |