# Identifier of the series examined in this notebook; it is used below to
# build the CSV filename under data/ and is passed to the summary display.
series = 'A6281'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise Plotly's offline notebook mode
py.init_notebook_mode()

# Build the CSV path for this series; any '/' in the identifier is swapped
# for '-' when forming the filename
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the harvested items, parsing the two date columns as datetimes
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series via the series_details helper
series_details.display_summary(series, df)
Total items | 17 |
---|---|
Access status | |
Open | 11 (64.71%) |
Not yet examined | 5 (29.41%) |
Open with exception | 1 (5.88%) |
Number of items digitised | 0 (0.00%) |
Number of pages digitised | 0 |
Date of earliest content | None |
Date of latest content | nan |
# How many rows of the dataframe to preview; increase to see more
number_of_rows = 5

# Preview the leading rows as a styled table: titles are left-aligned,
# header cells are centred, and the row-heading / blank cells are hidden
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 13187837 | A6281 | 1042788 | Conversation Between Dr Michael Bialoguski, Vladimir Petrov And Ron Richards. Conversation Between Dr Michael Bialoguski And Ron Richards. Conversation Between Dr Bialoguski And Vladimir Petrov. - Primary Version | circa1954 - circa1954 | NaT | NaT | Open | Various locations | False | 0 |
1 | 13187838 | A6281 | 1042796 | Conversation between Dr Michael Bialoguski, Vladimir Petrov and Ron Richards. Conversation between Vladimir Petrov and Ron Richards - Primary Version | circa1954 - circa1954 | NaT | NaT | Open | Various locations | False | 0 |
2 | 13187840 | A6281 | 1042945 | Parliamentary Debate On The Royal Commission On Espionage [House Of Representatives, 25 October 1955 - Rg Menzies - Part 2] - Primary Version | circa1955 - circa1955 | NaT | NaT | Open | Various locations | False | 0 |
3 | 13187841 | A6281 | 1042955 | Parliamentary Debate On The Royal Commission On Espionage [House Of Representatives, 25 October 1955 - Ej Ward - Part 2 And Sm Keon] - Primary Version | circa1955 - circa1955 | NaT | NaT | Not yet examined | Various locations | False | 0 |
4 | 13187854 | A6281 | 1042808 | Conversation between Vladimir Petrov and Ron Richards - Primary Version | circa1954 - circa1954 | NaT | NaT | Open | Various locations | False | 0 |
# Lower-case every file title and join them, space-separated, into one string
lowercase_titles = df['title'].str.lower()
title_text = lowercase_titles.str.cat(sep=' ')
# NOTE(review): `a` is a second name for the same string, kept from the
# original chained assignment; nothing in the visible cells uses it, so
# confirm whether it can be dropped.
a = title_text

# Show word counts for the combined titles via the series_details helper
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
8 | primary | 17 |
9 | version | 17 |
0 | conversation | 11 |
19 | 1955 | 11 |
16 | representatives | 11 |
15 | house | 11 |
5 | petrov | 11 |
11 | debate | 10 |
14 | espionage | 10 |
13 | commission | 10 |
12 | royal | 10 |
18 | october | 10 |
10 | parliamentary | 10 |
7 | richards | 10 |
22 | part | 9 |
6 | ron | 8 |
4 | vladimir | 8 |
17 | 25 | 7 |
1 | dr | 5 |
3 | bialoguski | 5 |
23 | 2 | 5 |
2 | michael | 4 |
21 | menzies | 4 |
35 | 19 | 3 |
39 | bourke | 3 |
# Size of the n-grams to report: 2 gives bigrams; change ngram_count for
# larger n-grams (3 for trigrams, etc.)
ngram_count = 2
# Display the top n-grams of that size from the combined, lower-cased titles
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | primary version | 17 |
1 | of representatives | 11 |
2 | house of | 11 |
3 | conversation between | 11 |
4 | commission on | 10 |
5 | the royal | 10 |
6 | version parliamentary | 10 |
7 | october 1955 | 10 |
8 | on espionage | 10 |
9 | debate on | 10 |
10 | on the | 10 |
11 | royal commission | 10 |
12 | parliamentary debate | 10 |
13 | espionage house | 9 |
14 | vladimir petrov | 8 |
15 | ron richards | 8 |
16 | representatives 25 | 7 |
17 | and ron | 6 |
18 | 25 october | 6 |
19 | part 2 | 5 |
20 | petrov and | 5 |
21 | between dr | 5 |
22 | version conversation | 5 |
23 | michael bialoguski | 4 |
24 | dr michael | 4 |