# Series identifier; used below to build the CSV file name and passed to
# series_details.display_summary
series = 'SP11/26'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise plotly's offline notebook mode before any figures are drawn
py.init_notebook_mode()

# The CSV file is named after the series with '/' swapped for '-',
# and is read from the 'data' directory; both date columns are parsed
csv_path = os.path.join('data', '{}.csv'.format(series.replace('/', '-')))
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series
series_details.display_summary(series, df)
Total items | 27 |
---|---|
Access status | |
Open | 27 (100.00%) |
Number of items digitised | 5 (18.52%) |
Number of pages digitised | 84 |
Date of earliest content | 1902 |
Date of latest content | 1902 |
# Change the number_of_rows value to see more
number_of_rows = 5

# Display dataframe: left-align the title column, centre the header cells
# and hide the row-heading cells
(
    df[:number_of_rows]
    .style
    .set_properties(['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1146999 | SP11/26 | A1 | William Ah Bow | 1902 - 1902 | 1902-01-01 00:00:00 | 1902-01-01 00:00:00 | Open | Sydney | False | 0 |
1 | 1511061 | SP11/26 | A2/69 | Ah Mung, Fong Tung | 1902 - 1902 | 1902-01-01 00:00:00 | 1902-01-01 00:00:00 | Open | Sydney | False | 0 |
2 | 1511086 | SP11/26 | A3/70123 | Bez Mahomet, Zareen, Doz Mahomet, Adam Khan [4 Afghans] | 1902 - 1902 | 1902-01-01 00:00:00 | 1902-01-01 00:00:00 | Open | Sydney | False | 0 |
3 | 1511100 | SP11/26 | A4/74 | Ah Kee | 1902 - 1902 | 1902-01-01 00:00:00 | 1902-01-01 00:00:00 | Open | Sydney | False | 0 |
4 | 1511125 | SP11/26 | A5/54 | Ah Chong | 1902 - 1902 | 1902-01-01 00:00:00 | 1902-01-01 00:00:00 | Open | Sydney | False | 0 |
# Build the dates figure for this series with series_details.plot_dates
fig = series_details.plot_dates(df)
# Render the figure in the notebook using plotly's offline mode
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased,
# space-separated string
title_text = df['title'].str.lower().str.cat(sep=' ')
# Display the word counts for the combined titles
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
1 | ah | 9 |
31 | jimmy | 2 |
37 | kum | 2 |
21 | sing | 2 |
49 | lee | 2 |
7 | mahomet | 2 |
28 | wing | 2 |
5 | tung | 2 |
4 | fong | 2 |
36 | yee | 2 |
38 | mock | 1 |
46 | foon | 1 |
39 | h | 1 |
40 | w | 1 |
41 | g | 1 |
42 | ky | 1 |
43 | ling | 1 |
44 | kday | 1 |
45 | loo | 1 |
48 | myerson | 1 |
47 | leon | 1 |
34 | james | 1 |
50 | shing | 1 |
51 | low | 1 |
52 | soo | 1 |
# Size of the ngrams to display: 2 gives bigrams; change ngram_count for
# larger ngrams (trigrams etc)
ngram_count = 2
# Display the top ngrams of that size found in the combined titles
series_details.display_top_ngrams(title_text, ngram_count)
/Users/tim/mycode/naa-data-wap/lib/python3.6/site-packages/pandas/io/formats/style.py:939: RuntimeWarning: divide by zero encountered in long_scalars
| | ngram | count |
---|---|---|
0 | doz mahomet | 1 |
1 | fong tung | 1 |
2 | ling kday | 1 |
3 | zareen doz | 1 |
4 | loo foon | 1 |
5 | fong sing | 1 |
6 | mahomet zareen | 1 |
7 | hack walter | 1 |
8 | sing bedi | 1 |
9 | w g | 1 |
10 | mar hong | 1 |
11 | yee kum | 1 |
12 | myerson lee | 1 |
13 | tung hack | 1 |
14 | shing low | 1 |
15 | ah wing | 1 |
16 | loong sing | 1 |
17 | ah bow | 1 |
18 | billy loong | 1 |
19 | jimmy chuck | 1 |
20 | ah kee | 1 |
21 | bedi joseph | 1 |
22 | kee ah | 1 |
23 | ky ling | 1 |
24 | low kum | 1 |