# Identifier of the series this notebook reports on.
series = 'SP11/6'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise plotly's offline notebook mode.
py.init_notebook_mode()

# The CSV file is named after the series, with '/' replaced by '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as dates while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

series_details.display_summary(series, df)
Total items | 191 |
---|---|
Access status | |
Open | 101 (52.88%) |
Not yet examined | 90 (47.12%) |
Number of items digitised | 1 (0.52%) |
Number of pages digitised | 323 |
Date of earliest content | 1902 |
Date of latest content | 1947 |
# Change the number_of_rows value to see more
number_of_rows = 5

# Header cells are centred; the row-index column and its blank header
# cell are hidden.
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe, with titles left-aligned.
# This is kept as the trailing expression so the notebook renders it.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles(table_styles)
)
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 692978 | SP11/6 | NN | Certificate Exempting From Dictation Test, Immigration Act 1901-1925: Asian passengers per the SS Aki Maru Sydney 13/5/27 [BOX 1] | 1927 - 1927 | 1927-01-01 00:00:00 | 1927-01-01 00:00:00 | Open | Sydney | False | 0 |
1 | 692979 | SP11/6 | NN | Certificate Exempting From Dictation Test, Immigration Act 1901-1925: Chinese passengers per the SS Arafura Sydney 21/5/27 [BOX 1] | 1927 - 1927 | 1927-01-01 00:00:00 | 1927-01-01 00:00:00 | Open | Sydney | False | 0 |
2 | 692980 | SP11/6 | NN | Certificate Exempting From Dictation Test, Immigration Act 1901-1925: Chinese passengers per the SS Changte Sydney 04/5/27 [BOX 1] | 1927 - 1927 | 1927-01-01 00:00:00 | 1927-01-01 00:00:00 | Open | Sydney | False | 0 |
3 | 692981 | SP11/6 | NN | Certificate Exempting From Dictation Test, Immigration Act 1901-1925: Chinese passengers per the SS St Albans Sydney 22/4/27 [BOX 1] | 1927 - 1927 | 1927-01-01 00:00:00 | 1927-01-01 00:00:00 | Open | Sydney | False | 0 |
4 | 692982 | SP11/6 | 1 | Chinese Sydney Taiping, 6 April 1927 | 1927 - 1927 | 1927-01-01 00:00:00 | 1927-01-01 00:00:00 | Open | Sydney | True | 323 |
# Build a figure from the dataframe using the series_details helper.
# NOTE(review): presumably a bar chart of content dates, judging by the
# filename below -- confirm in series_details.plot_dates.
fig = series_details.plot_dates(df)
# Render the figure with plotly's offline iplot.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string,
# with a single space between consecutive titles.
title_text = df['title'].str.lower().str.cat(sep=' ')
# Pass the combined text to the series_details word-count display.
series_details.display_word_counts(title_text)
word | count | |
---|---|---|
13 | sydney | 163 |
17 | chinese | 132 |
15 | box | 93 |
0 | certificate | 90 |
4 | immigration | 88 |
5 | act | 88 |
2 | dictation | 86 |
3 | test | 86 |
1 | exempting | 85 |
9 | per | 85 |
10 | ss | 84 |
8 | passengers | 83 |
6 | 1901-1925 | 57 |
28 | 1927 | 52 |
12 | maru | 40 |
41 | indian | 33 |
142 | 1901 | 31 |
143 | 1925 | 31 |
114 | 1926 | 29 |
145 | 5 | 24 |
75 | 2 | 22 |
20 | changte | 20 |
16 | 1 | 19 |
60 | tanda | 17 |
25 | taiping | 17 |
# Change ngram_count for larger ngrams (trigrams etc)
# A value of 2 yields two-word phrases (bigrams).
ngram_count = 2
# Display the most frequent ngrams of the chosen size found in the
# combined title text.
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | immigration act | 88 |
1 | from dictation | 86 |
2 | dictation test | 86 |
3 | certificate exempting | 85 |
4 | test immigration | 85 |
5 | exempting from | 85 |
6 | passengers per | 81 |
7 | per ss | 72 |
8 | chinese passengers | 62 |
9 | chinese sydney | 59 |
10 | act 1901-1925 | 57 |
11 | at sydney | 52 |
12 | 1901-1925 chinese | 44 |
13 | act 1901 | 31 |
14 | 1901 1925 | 31 |
15 | 1927 chinese | 31 |
16 | 1925 chinese | 21 |
17 | 1926 chinese | 20 |
18 | 2 certificate | 19 |
19 | box 2 | 19 |
20 | box 5 | 19 |
21 | indian sydney | 17 |
22 | box 1 | 17 |
23 | 5 certificate | 16 |
24 | st albans | 16 |