# Identifier of the series to summarise; it also determines the CSV file name
series = 'A9108'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Prepare plotly's offline mode for rendering inside the notebook
py.init_notebook_mode()
# Any '/' in the series identifier is swapped for '-' to form the file name
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)
# Load the harvested items, parsing both date columns as datetimes
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])
series_details.display_summary(series, df)
Total items | 691 |
---|---|
Access status | |
Open with exception | 465 (67.29%) |
Open | 220 (31.84%) |
Closed | 4 (0.58%) |
Not yet examined | 2 (0.29%) |
Number of items digitised | 107 (15.48%) |
Number of pages digitised | 9,810 |
Date of earliest content | 1920 |
Date of latest content | 1967 |
# Change the number_of_rows value to see more
number_of_rows = 5
# CSS rules for the rendered table: centre the header cells and hide the
# cells carrying the .row_heading / .blank classes (presumably the index column)
table_styles = [
    {'selector': "th", 'props': [("text-align", "center")]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]
# Display dataframe, with the title column left-aligned
preview = df[:number_of_rows].style
preview = preview.set_properties(['title'], **{'text-align': 'left'})
preview.set_table_styles(table_styles)
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 506074 | A9108 | ROLL 20/28 | Communist meetings in Western Australia | 1945 - 1949 | 1945-01-01 00:00:00 | 1949-01-01 00:00:00 | Open | Canberra | False | 0 |
1 | 1353596 | A9108 | ROLL 20/39 | Netherlands East Indies - Military detention camp - Casino [176pp] | 1945 - 1947 | 1945-01-01 00:00:00 | 1947-01-01 00:00:00 | Open | Canberra | True | 199 |
2 | 1353598 | A9108 | ROLL 20/45 | Chinese communists - General activities in Australia | 1948 - 1951 | 1948-01-01 00:00:00 | 1951-01-01 00:00:00 | Open with exception | Canberra | True | 145 |
3 | 1353599 | A9108 | ROLL 20/27 | Communistic direct action and demonstration in South Australia | 1949 - 1949 | 1949-01-01 00:00:00 | 1949-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
4 | 1353600 | A9108 | ROLL 20/46 | Chinese Consulate - Activities | 1950 - 1951 | 1950-01-01 00:00:00 | 1951-01-01 00:00:00 | Open with exception | Canberra | True | 9 |
# Build a figure from the dataframe using the series_details helper
fig = series_details.plot_dates(df)
# Render the figure in the notebook through plotly's offline iplot
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string,
# with the individual titles separated by spaces
title_text = df['title'].str.lower().str.cat(sep=' ')
# Pass the combined text to the series_details helper that displays word counts
series_details.display_word_counts(title_text)
word | count | |
---|---|---|
0 | communist | 147 |
256 | pages | 139 |
51 | party | 133 |
101 | australian | 127 |
3 | australia | 83 |
356 | 0.5cm | 70 |
352 | 1cm | 60 |
15 | activities | 39 |
22 | communism | 32 |
1053 | ship | 31 |
20 | south | 28 |
743 | 0.25cm | 28 |
655 | 2cm | 25 |
64 | organisations | 23 |
156 | 1952 | 23 |
78 | new | 22 |
23 | trade | 21 |
320 | committee | 20 |
639 | nazi | 20 |
578 | movement | 18 |
260 | 4 | 18 |
310 | service | 18 |
12 | chinese | 18 |
63 | association | 17 |
342 | league | 17 |
# Number of words per ngram: 2 gives two-word phrases; change ngram_count
# for larger ngrams (3 for trigrams etc)
ngram_count = 2
# Display the top ngrams of that size found in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | communist party | 116 |
1 | australian communist | 99 |
2 | 0.5cm australian | 30 |
3 | in australia | 26 |
4 | communism in | 25 |
5 | south australia | 21 |
6 | of australia | 21 |
7 | party of | 19 |
8 | pages australian | 18 |
9 | 4 pages | 18 |
10 | activities in | 17 |
11 | 8 pages | 15 |
12 | in the | 15 |
13 | in south | 13 |
14 | 2 pages | 13 |
15 | for peace | 12 |
16 | pty ltd | 12 |
17 | trade unions | 12 |
18 | 1cm australian | 12 |
19 | 5 pages | 11 |
20 | 7 pages | 11 |
21 | 10 pages | 11 |
22 | in trade | 11 |
23 | in victoria | 11 |
24 | 3 pages | 10 |