# Identifier of the series to summarise; it selects the CSV file below and
# is passed to the summary display.
series = 'A13828'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Set up plotly's offline notebook mode before any figure is drawn.
py.init_notebook_mode()

# Build the path data/<series>.csv, swapping any '/' in the identifier for '-'
# (presumably because '/' cannot appear in a file name).
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the harvested items, parsing both date columns as datetimes.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series.
series_details.display_summary(series, df)
Total items | 12 |
---|---|
Access status | |
Not yet examined | 9 (75.00%) |
Open | 3 (25.00%) |
Number of items digitised | 0 (0.00%) |
Number of pages digitised | 0 |
Date of earliest content | 1955 |
Date of latest content | 1974 |
# Number of rows to preview -- raise this value to see more of the dataframe.
number_of_rows = 5

# CSS rules for the preview table: centre the header cells and hide the
# row-heading (index) column together with its blank corner cell.
centred_headers = {'selector': "th", 'props': [("text-align", "center")]}
hidden_index = {'selector': '.row_heading, .blank', 'props': [('display', 'none')]}

# Style the first rows: left-align the long title column, then apply the
# table-level rules. The final expression is left bare so the notebook
# renders the resulting Styler.
preview = df[:number_of_rows].style
preview = preview.set_properties(['title'], **{'text-align': 'left'})
preview.set_table_styles([centred_headers, hidden_index])
 | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30342357 | A13828 | 32/1/65.2 | TITLE: Ceskoslovensky Filmovy Tydenik - Czechoslovak Film Weekly Magazine. Number 35 1968, Special Edition 1968 and Number 36 1968 [Papers containing information related to the file item - script] | 1968 - 1968 | 1968-01-01 00:00:00 | 1968-01-01 00:00:00 | Not yet examined | Sydney | False | 0 |
1 | 30342399 | A13828 | 32/1/4.3 | TITLE: The Lecture [Paper items relating to the file item - Script] | circa1962 - circa1962 | NaT | NaT | Not yet examined | Sydney | False | 0 |
2 | 60089626 | A13828 | 32/1/49 Volume 2 | TITLE: Communist Party of Australia [CPA] South Australia State Conference - Adelaide 18 - 19 March 1972 [Papers containing information related to audiovisual item - Script] | 1972 - 1972 | 1972-01-01 00:00:00 | 1972-01-01 00:00:00 | Not yet examined | Sydney | False | 0 |
3 | 60089627 | A13828 | 32/1/56 Volume 2 | TITLE: State Conference Communist Party of Australia [CPA] Adelaide - 1974 [Papers containing information related to audiovisual item - Script] | 1974 - 1974 | 1974-01-01 00:00:00 | 1974-01-01 00:00:00 | Open | Sydney | False | 0 |
4 | 60089628 | A13828 | 32/1/61 Volume 2 | TITLE: South Australian State Conference 28 September 1968 Day 1 [Papers containing information related to audiovisual item - Shot List] | 1968 - 1968 | 1968-01-01 00:00:00 | 1968-01-01 00:00:00 | Open | Sydney | False | 0 |
# Build a figure from the dataframe with the series_details helper
# (presumably a chart of the items' content dates -- confirm in series_details).
fig = series_details.plot_dates(df)
# Render the figure inline using plotly's offline iplot.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single string: lower-case each title
# and join them with single spaces so words can be counted across the series.
title_text = df['title'].str.lower().str.cat(sep=' ')
# Display the word counts for the combined title text.
series_details.display_word_counts(title_text)
 | word | count |
---|---|---|
0 | title | 12 |
19 | item | 12 |
16 | information | 11 |
15 | containing | 11 |
14 | papers | 11 |
17 | related | 11 |
37 | audiovisual | 10 |
31 | conference | 9 |
30 | state | 8 |
29 | south | 7 |
44 | shot | 6 |
45 | list | 6 |
20 | script | 5 |
10 | 1968 | 5 |
27 | australia | 5 |
39 | australian | 5 |
42 | day | 4 |
28 | cpa | 3 |
35 | march | 3 |
25 | communist | 3 |
41 | september | 3 |
26 | party | 3 |
43 | 1 | 2 |
47 | 2 | 2 |
50 | 1970 | 2 |
# Number of consecutive words in each ngram: 2 gives bigrams.
# Change ngram_count for larger ngrams (trigrams etc).
ngram_count = 2
# Display the top ngrams of that size found in the combined title text.
series_details.display_top_ngrams(title_text, ngram_count)
 | ngram | count |
---|---|---|
0 | related to | 11 |
1 | papers containing | 11 |
2 | containing information | 11 |
3 | information related | 11 |
4 | audiovisual item | 10 |
5 | to audiovisual | 10 |
6 | state conference | 8 |
7 | list title | 6 |
8 | item shot | 6 |
9 | shot list | 6 |
10 | australian state | 5 |
11 | south australian | 5 |
12 | item script | 5 |
13 | title south | 5 |
14 | script title | 4 |
15 | communist party | 3 |
16 | australia cpa | 3 |
17 | of australia | 3 |
18 | party of | 3 |
19 | 2 papers | 2 |
20 | 1 papers | 2 |
21 | march 1970 | 2 |
22 | september 1968 | 2 |
23 | the file | 2 |
24 | 1968 day | 2 |