series = 'A6283'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Switch Plotly's offline mode to notebook rendering before any chart is drawn.
py.init_notebook_mode()

# The CSV lives in the 'data' directory and is named after the series,
# with any '/' in the series identifier replaced by '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series.
series_details.display_summary(series, df)
Total items | 256 |
---|---|
Access status | |
Open with exception | 208 (81.25%) |
Not yet examined | 24 (9.38%) |
Open | 21 (8.20%) |
Closed | 3 (1.17%) |
Number of items digitised | 23 (8.98%) |
Number of pages digitised | 3,352 |
Date of earliest content | 1800 |
Date of latest content | 1959 |
# Number of rows to preview -- increase this value to see more
number_of_rows = 5

# CSS rules for the preview table: centre the header cells and hide the
# row-index column (its heading cells and the blank corner cell).
table_styles = [
    {'selector': "th", 'props': [("text-align", "center")]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe, with titles left-aligned
preview = df[:number_of_rows]
preview.style.set_properties(['title'], **{'text-align': 'left'}).set_table_styles(table_styles)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 241384 | A6283 | 18 | Communist Party of Australia | 1954 - 1955 | 1954-01-01 00:00:00 | 1955-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
1 | 241385 | A6283 | 19 | Voks | 1954 - 1955 | 1954-01-01 00:00:00 | 1955-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
2 | 241388 | A6283 | 24 | Australian Government Policies | 1954 - 1954 | 1954-01-01 00:00:00 | 1954-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
3 | 241389 | A6283 | 25 | Australian Government Departments and Instrumentalities | 1954 - 1955 | 1954-01-01 00:00:00 | 1955-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
4 | 241390 | A6283 | 27 | Corrective Labour Camps | 1954 - 1954 | 1954-01-01 00:00:00 | 1954-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
# Build the dates figure from the dataframe, then render it inline in the notebook
fig = series_details.plot_dates(df)
py.iplot(figure_or_data=fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string,
# with a space between consecutive titles
title_text = df['title'].str.lower().str.cat(sep=' ')
# Display the word counts for the combined titles
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
183 | copy | 96 |
182 | reference | 96 |
33 | volume | 55 |
40 | petrov | 50 |
35 | vol | 41 |
49 | vladimir | 37 |
78 | royal | 29 |
47 | press | 28 |
38 | 5 | 27 |
79 | commission | 27 |
29 | interrogations | 27 |
48 | cuttings | 26 |
30 | petrovs | 25 |
41 | 1 | 23 |
46 | 2 | 22 |
50 | mihailovich | 21 |
82 | mvd | 21 |
56 | evdokia | 20 |
42 | overseas | 19 |
28 | reports | 19 |
67 | intelligence | 17 |
68 | services | 17 |
52 | story | 16 |
226 | | 16 |
228 | september | 16 |
# Number of words per ngram: 2 gives bigrams; change ngram_count for
# larger ngrams (3 for trigrams etc)
ngram_count = 2
# Display the top ngrams of that size found in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | reference copy | 96 |
1 | of interrogations | 27 |
2 | royal commission | 27 |
3 | press cuttings | 26 |
4 | of petrovs | 24 |
5 | cuttings volume | 24 |
6 | petrov vladimir | 22 |
7 | volume 5 | 22 |
8 | vladimir mihailovich | 21 |
9 | interrogations of | 20 |
10 | overseas intelligence | 17 |
11 | intelligence services | 17 |
12 | 5 'the | 16 |
13 | reports of | 16 |
14 | 'the courier | 14 |
15 | by vladimir | 14 |
16 | evdokia petrov | 14 |
17 | courier mail | 14 |
18 | empire of | 13 |
19 | the empire | 13 |
20 | vladimir and | 13 |
21 | and evdokia | 13 |
22 | of fear | 13 |
23 | 1955 p2 | 12 |
24 | vol 1 | 11 |