# Identifier of the series to analyse; it is used below to build the CSV filename and is passed to the summary display.
series = 'K1145'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Switch Plotly's offline module into notebook mode before any chart is drawn.
py.init_notebook_mode()

# The CSV is named after the series, with any '/' in the identifier replaced by '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse both date columns as datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series.
series_details.display_summary(series, df)
Total items | 4,816 |
---|---|
Access status | |
Open | 4,791 (99.48%) |
Not yet examined | 25 (0.52%) |
Number of items digitised | 175 (3.63%) |
Number of pages digitised | 874 |
Date of earliest content | 1900 |
Date of latest content | 1955 |
# Number of rows to preview; raise this value to see more
number_of_rows = 5

# Table-level style rules: centre the header cells and hide the index column
centre_headers = dict(selector="th", props=[("text-align", "center")])
hide_index = dict(selector='.row_heading, .blank', props=[('display', 'none')])

# Display the first rows of the dataframe, with the titles left-aligned
(
    df[:number_of_rows]
    .style
    .set_properties(['title'], **{'text-align': 'left'})
    .set_table_styles([centre_headers, hide_index])
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1719426 | K1145 | 1900/95 | Ah Kin [Chinese] | 1900 - 1900 | 1900-01-01 00:00:00 | 1900-01-01 00:00:00 | Open | Perth | False | 0 |
1 | 1719427 | K1145 | 1900/116 | Ah Leck [Chinese] | 1900 - 1900 | 1900-01-01 00:00:00 | 1900-01-01 00:00:00 | Open | Perth | False | 0 |
2 | 1719428 | K1145 | 1900/144 | Ah Shim [Chinese] | 1900 - 1900 | 1900-01-01 00:00:00 | 1900-01-01 00:00:00 | Open | Perth | False | 0 |
3 | 1719429 | K1145 | 1900/165 | Mahomet Rasool [Afghan] | 1900 - 1900 | 1900-01-01 00:00:00 | 1900-01-01 00:00:00 | Open | Perth | False | 0 |
4 | 1719431 | K1145 | 1900/169 | Lee Yacke [Chinese] | 1900 - 1900 | 1900-01-01 00:00:00 | 1900-01-01 00:00:00 | Open | Perth | False | 0 |
# Build a figure from the dataframe using the project's plot_dates helper
# (presumably a bar chart based on the items' content dates -- confirm in series_details).
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook via Plotly's offline mode.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased, space-separated string.
# (The stray `a` alias from the original chained assignment is dropped; nothing visible reads it.)
title_text = df['title'].str.lower().str.cat(sep=' ')

# Display the word counts for the combined title text
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
2 | chinese | 3,293 |
0 | ah | 1,064 |
90 | japanese | 708 |
30 | indian | 593 |
8 | lee | 321 |
57 | fong | 222 |
139 | wong | 192 |
12 | sing | 158 |
7 | afghan | 144 |
122 | singh | 131 |
54 | chen | 123 |
40 | chong | 98 |
172 | chung | 89 |
44 | wing | 88 |
75 | yee | 78 |
167 | chew | 78 |
210 | mahomed | 73 |
67 | wah | 72 |
157 | chin | 69 |
71 | kee | 68 |
66 | hong | 63 |
34 | khan | 61 |
323 | sam | 58 |
79 | hing | 57 |
673 | born | 49 |
# Size of the ngrams to count: 2 gives bigrams; change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Display the top ngrams of that size found in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | chinese ah | 741 |
1 | chinese lee | 199 |
2 | singh indian | 129 |
3 | japanese ah | 122 |
4 | sing chinese | 110 |
5 | chinese wong | 106 |
6 | indian ah | 91 |
7 | fong chinese | 78 |
8 | chinese fong | 78 |
9 | chinese chen | 77 |
10 | chong chinese | 70 |
11 | you chinese | 66 |
12 | kee chinese | 62 |
13 | lee chinese | 53 |
14 | wah chinese | 50 |
15 | hong chinese | 50 |
16 | wing chinese | 49 |
17 | chinese chung | 47 |
18 | hing chinese | 46 |
19 | shing chinese | 45 |
20 | ah sing | 45 |
21 | bux indian | 43 |
22 | chinese yee | 42 |
23 | chew chinese | 41 |
24 | sam chinese | 40 |