# Identifier of the series to summarise; it also determines the CSV file name.
series = 'A6122'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Enable inline plotly rendering for the notebook.
py.init_notebook_mode()

# Slashes in a series identifier are swapped for hyphens to get the file name.
safe_series = series.replace('/', '-')
csv_path = os.path.join('data', '{}.csv'.format(safe_series))

# Load the harvested items, parsing the two date columns as datetimes.
date_columns = ['start_date', 'end_date']
df = pd.read_csv(csv_path, parse_dates=date_columns)

series_details.display_summary(series, df)
Total items | 2,819 |
---|---|
Access status | |
Open with exception | 2,376 (84.29%) |
Open | 162 (5.75%) |
Closed | 138 (4.90%) |
Not yet examined | 137 (4.86%) |
Withheld pending agency advice | 6 (0.21%) |
Number of items digitised | 565 (20.04%) |
Number of pages digitised | 69,007 |
Date of earliest content | 1800 |
Date of latest content | 1993 |
# Change the number_of_rows value to see more
number_of_rows = 5

# Table-level CSS: centre the header cells and hide the index column
# (row headings plus the blank corner cell).
table_styles = [
    {'selector': "th", 'props': [("text-align", "center")]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display dataframe: style the first rows, left-aligning the title column.
preview = df[:number_of_rows].style
preview = preview.set_properties(subset=['title'], **{'text-align': 'left'})
# Keep this as the final expression so the notebook renders the styled table.
preview.set_table_styles(table_styles)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 217109 | A6122 | 124 | Comintern [Communist International] [includes papers on The League against Cruelties and Oppression in the Colonies] File ends 1966. | 1927 - 1966 | 1927-01-01 00:00:00 | 1966-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
1 | 217110 | A6122 | 125 | Left Book Club | 1938 - 1941 | 1938-01-01 00:00:00 | 1941-01-01 00:00:00 | Open | Canberra | False | 0 |
2 | 217111 | A6122 | 126 | Industrial Workers of the World Volume 1 Part 1 | 1916 - 1948 | 1916-01-01 00:00:00 | 1948-01-01 00:00:00 | Open with exception | Canberra | True | 253 |
3 | 217113 | A6122 | 127 | Textile Workers Militant Committee | 1944 - 1944 | 1944-01-01 00:00:00 | 1944-01-01 00:00:00 | Open | Canberra | False | 0 |
4 | 217114 | A6122 | 128 | Irish Republican Army | 1940 - 1944 | 1940-01-01 00:00:00 | 1944-01-01 00:00:00 | Open | Canberra | True | 63 |
# Build the date chart for this series with the series_details helper
# (presumably a bar chart, going by the filename below -- confirm in
# series_details), then render it inline with plotly's offline mode.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with a
# space between titles. Series.str.cat omits missing values by default, so
# items without a title are simply skipped.
title_text = df['title'].str.lower().str.cat(sep=' ')
# Pass the combined text to the series_details helper for the word count table.
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
42 | australia | 1,339 |
18 | volume | 1,277 |
45 | party | 1,078 |
1 | communist | 1,061 |
19 | 1 | 456 |
44 | cpa | 418 |
102 | australian | 359 |
279 | branch | 334 |
36 | south | 308 |
64 | 2 | 278 |
35 | new | 275 |
226 | nsw | 231 |
204 | interest | 222 |
202 | victoria | 195 |
37 | wales | 182 |
46 | queensland | 167 |
193 | 3 | 155 |
114 | asio | 153 |
317 | associations | 146 |
268 | cp | 131 |
339 | 4 | 117 |
34 | council | 114 |
39 | union | 107 |
47 | association | 106 |
43 | general | 104 |
# Number of consecutive words per ngram: 2 gives bigrams, 3 gives trigrams.
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Pass the combined title text and the ngram size to the series_details
# helper, which displays the ngram table.
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | communist party | 1,019 |
1 | of australia | 1,016 |
2 | party of | 973 |
3 | volume 1 | 398 |
4 | cpa communist | 250 |
5 | volume 2 | 244 |
6 | interest in | 211 |
7 | new south | 182 |
8 | south wales | 182 |
9 | of a | 160 |
10 | a communist | 145 |
11 | volume 3 | 141 |
12 | cp of | 131 |
13 | branch communist | 118 |
14 | volume 4 | 109 |
15 | associations individual | 95 |
16 | australia interest | 88 |
17 | south australia | 87 |
18 | australia new | 83 |
19 | australia queensland | 79 |
20 | australia nsw | 77 |
21 | of the | 76 |
22 | western australia | 76 |
23 | australia victoria | 72 |
24 | nsw volume | 71 |