# Identifier of the series this notebook examines
series = 'A6282'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise Plotly's notebook mode before any figure is drawn
py.init_notebook_mode()

# The series data is read from data/<series>.csv, with any '/' in the
# identifier replaced by '-' to form the file name.
csv_name = series.replace('/', '-') + '.csv'
df = pd.read_csv(
    os.path.join('data', csv_name),
    parse_dates=['start_date', 'end_date'],
)

# Display the summary for this series
series_details.display_summary(series, df)
Total items | 14 |
---|---|
Access status | |
Open | 13 (92.86%) |
Open with exception | 1 (7.14%) |
Number of items digitised | 2 (14.29%) |
Number of pages digitised | 328 |
Date of earliest content | 1954 |
Date of latest content | 1956 |
# Increase number_of_rows to preview more of the dataframe
number_of_rows = 5

# Centre the column headings and hide the row-heading / blank header cells.
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Take the first rows and left-align the 'title' column.
preview = df[:number_of_rows].style
preview = preview.set_properties(['title'], **{'text-align': 'left'})

# Last expression of the cell, so the styled table is what gets displayed.
preview.set_table_styles(table_styles)
 | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4185721 | A6282 | 1 | [Folders of newspaper cuttings relating to the Royal Commission on Espionage] 14 April 1954 to 22 April 1954 | 1954 - 1954 | 1954-01-01 00:00:00 | 1954-01-01 00:00:00 | Open | Canberra | False | 0 |
1 | 4185722 | A6282 | 2 | [Folders of newspaper cuttings relating to the Royal Commission on Espionage] 23 April 1954 to 30 April 1954 | 1954 - 1954 | 1954-01-01 00:00:00 | 1954-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
2 | 4185723 | A6282 | 3 | [Folders of newspaper cuttings relating to the Royal Commission on Espionage] 1 May 1954 to 20 May 1954 | 1954 - 1954 | 1954-01-01 00:00:00 | 1954-01-01 00:00:00 | Open | Canberra | False | 0 |
3 | 4185724 | A6282 | 4 | [Folders of newspaper cuttings relating to the Royal Commission on Espionage] 23 May 1954 to 10 July 1954 | 1954 - 1954 | 1954-01-01 00:00:00 | 1954-01-01 00:00:00 | Open | Canberra | True | 262 |
4 | 4185725 | A6282 | 5 | [Folders of newspaper cuttings relating to the Royal Commission on Espionage] 11 July 1954 to 14 August 1954 | 1954 - 1954 | 1954-01-01 00:00:00 | 1954-01-01 00:00:00 | Open | Canberra | False | 0 |
# Build the date plot from the series dataframe, then render it with Plotly
fig = series_details.plot_dates(df)
py.iplot(
    fig,
    filename='series-dates-bar',
)
# Lower-case every file title and join them into a single space-separated string
title_text = df['title'].str.lower().str.cat(sep=' ')
# Display the word counts computed from the combined title text
series_details.display_word_counts(title_text)
 | word | count |
---|---|---|
9 | 1954 | 18 |
2 | cuttings | 14 |
0 | folders | 13 |
3 | relating | 13 |
4 | royal | 13 |
5 | commission | 13 |
6 | espionage | 13 |
1 | newspaper | 13 |
32 | 1955 | 5 |
22 | september | 5 |
8 | april | 4 |
14 | may | 3 |
19 | august | 3 |
27 | october | 2 |
20 | 15 | 2 |
35 | 12 | 2 |
11 | 23 | 2 |
24 | 16 | 2 |
23 | 5 | 2 |
7 | 14 | 2 |
33 | february | 2 |
17 | july | 2 |
13 | 1 | 2 |
40 | press | 1 |
41 | south | 1 |
# Size of the n-grams to count: 2 gives bigrams, 3 gives trigrams, and so on
ngram_count = 2
series_details.display_top_ngrams(
    title_text,
    ngram_count,
)
 | ngram | count |
---|---|---|
0 | of newspaper | 13 |
1 | relating to | 13 |
2 | newspaper cuttings | 13 |
3 | cuttings relating | 13 |
4 | commission on | 13 |
5 | folders of | 13 |
6 | to the | 13 |
7 | on espionage | 13 |
8 | royal commission | 13 |
9 | the royal | 13 |
10 | 1954 to | 9 |
11 | 1954 folders | 9 |
12 | april 1954 | 4 |
13 | september 1954 | 4 |
14 | may 1954 | 3 |
15 | 1955 to | 3 |
16 | august 1954 | 2 |
17 | espionage 1 | 2 |
18 | espionage 5 | 2 |
19 | espionage 23 | 2 |
20 | july 1954 | 2 |
21 | february 1955 | 2 |
22 | 1955 folders | 2 |
23 | october 1954 | 2 |
24 | 14 april | 1 |