import os
import pandas as pd
import series_details
import plotly.offline as py

# Identifier of the series this notebook summarises.
series = 'SP115/10'

# Initialise plotly's notebook mode so figures can be shown inline.
py.init_notebook_mode()

# The CSV file is named after the series, with '/' swapped for '-'
# so that the identifier can be used as a file name under data/.
csv_name = series.replace('/', '-') + '.csv'
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series (kept last so the notebook renders it).
series_details.display_summary(series, df)
Total items | 6 |
---|---|
Access status | |
Open | 6 (100.00%) |
Number of items digitised | 0 (0.00%) |
Number of pages digitised | 0 |
Date of earliest content | 1884 |
Date of latest content | 1888 |
# Change the number_of_rows value to see more
number_of_rows = 5

# CSS rules for the rendered table: centre the header cells and hide the
# index column (row headings and the blank corner cell).
centred_headers = dict(selector="th", props=[("text-align", "center")])
hidden_index = dict(selector='.row_heading, .blank', props=[('display', 'none')])

# Style the first number_of_rows rows, left-aligning the 'title' column.
preview = df[:number_of_rows].style
preview = preview.set_properties(subset=['title'], **{'text-align': 'left'})

# Display dataframe (the Styler must be the final expression to be rendered)
preview.set_table_styles([centred_headers, hidden_index])
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1826044 | SP115/10 | WHOLE SERIES | Certificates Exempting from the provisions of 'The Influx of Chinese Restriction Act 1881' | 1884 - 1888 | 1884-01-01 00:00:00 | 1888-01-01 00:00:00 | Open | Sydney | False | 0 |
1 | 11014873 | SP115/10 | 804 | Ah Luck [box 1] | 1884 - 1884 | 1884-01-01 00:00:00 | 1884-01-01 00:00:00 | Open | Sydney | False | 0 |
2 | 12145978 | SP115/10 | 800 | Ah See [box 1] | 1884 - 1884 | 1884-01-01 00:00:00 | 1884-01-01 00:00:00 | Open | Sydney | False | 0 |
3 | 12187508 | SP115/10 | 1092 | Ah Gee [box 1] | 1885 - 1885 | 1885-01-01 00:00:00 | 1885-01-01 00:00:00 | Open | Sydney | False | 0 |
4 | 12255210 | SP115/10 | 1643 | Ah Kum [box 1] | 1886 - 1886 | 1886-01-01 00:00:00 | 1886-01-01 00:00:00 | Open | Sydney | False | 0 |
# Build a figure from the dataframe with the project helper, then render it
# inline in the notebook via plotly's offline mode.
# NOTE(review): the filename suggests a bar chart of the series' dates —
# confirm against series_details.plot_dates.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Lower-case every file title, then join them (space-separated) into one string
lowercase_titles = df['title'].str.lower()
# NOTE(review): `a` is a second name bound to the same string; it is kept
# because later cells may refer to it.
title_text = a = lowercase_titles.str.cat(sep=' ')
# Show how often each word occurs across the combined titles
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
8 | ah | 5 |
10 | box | 5 |
11 | 1 | 5 |
0 | certificates | 1 |
1 | exempting | 1 |
2 | provisions | 1 |
3 | influx | 1 |
4 | chinese | 1 |
5 | restriction | 1 |
6 | act | 1 |
7 | 1881 | 1 |
9 | luck | 1 |
12 | see | 1 |
13 | gee | 1 |
14 | kum | 1 |
15 | gow | 1 |
# Change ngram_count for larger ngrams (trigrams etc)
# ngram_count is the number of words per ngram: 2 gives bigrams, 3 trigrams.
ngram_count = 2
# Show the ngrams of that size found in the combined title text, with counts.
# NOTE(review): ordering/cut-off of the "top" ngrams is decided inside
# series_details.display_top_ngrams — confirm there.
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | box 1 | 5 |
1 | 1 ah | 4 |
2 | restriction act | 1 |
3 | ah see | 1 |
4 | certificates exempting | 1 |
5 | act 1881 | 1 |
6 | from the | 1 |
7 | ah luck | 1 |
8 | ah gee | 1 |
9 | gow box | 1 |
10 | luck box | 1 |
11 | influx of | 1 |
12 | see box | 1 |
13 | the provisions | 1 |
14 | ah kum | 1 |
15 | ah gow | 1 |
16 | provisions of | 1 |
17 | 'the influx | 1 |
18 | chinese restriction | 1 |
19 | of chinese | 1 |
20 | kum box | 1 |
21 | gee box | 1 |
22 | 1881 ah | 1 |
23 | exempting from | 1 |
24 | of 'the | 1 |