# Identifier of the series examined in this notebook. It is used below to
# build the name of the CSV file to load and is passed to the summary display.
series = 'A12694'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise Plotly's offline notebook mode before any figures are drawn.
py.init_notebook_mode()

# The data file lives in the 'data' directory and is named after the series,
# with any '/' in the identifier swapped for '-'.
csv_path = os.path.join('data', '{}.csv'.format(series.replace('/', '-')))

# Read the file, parsing the two date columns as dates rather than strings.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Hand the series identifier and its data to the project's summary helper.
series_details.display_summary(series, df)
Total items | 25 |
---|---|
Access status | |
Open with exception | 20 (80.00%) |
Open | 5 (20.00%) |
Number of items digitised | 8 (32.00%) |
Number of pages digitised | 669 |
Date of earliest content | 1965 |
Date of latest content | 1986 |
# Number of rows to preview -- increase this value to see more.
number_of_rows = 5

# Preview the first rows as a styled table: titles are left-aligned, header
# cells are centred, and the row-heading / blank corner cells are hidden.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7949343 | A12694 | 8 | Directors and Regional Directors Conferences from 1970 | 1970 - 1973 | 1970-01-01 00:00:00 | 1973-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
1 | 7949625 | A12694 | 9 | Throssell, Richard Prichard Volume 4 | 1965 - 1969 | 1965-01-01 00:00:00 | 1969-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
2 | 7949626 | A12694 | 10 | Throssell, Richard Prichard Volume 5 | 1971 - 1974 | 1971-01-01 00:00:00 | 1974-01-01 00:00:00 | Open with exception | Canberra | True | 70 |
3 | 7949627 | A12694 | 11 | Policy or directives about the employment of homosexuals Volume 2 | 1969 - 1969 | 1969-01-01 00:00:00 | 1969-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
4 | 7949628 | A12694 | 12 | Policy or directives about the employment of homosexuals Volume 3 | 1970 - 1971 | 1970-01-01 00:00:00 | 1971-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
# Build a figure from the dataframe using the project's plot_dates helper.
# NOTE(review): the helper is not shown here; presumably it charts the items
# by their start/end dates -- confirm against series_details.
fig = series_details.plot_dates(df)
# Draw the figure in the notebook via Plotly's offline mode.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by spaces.
title_text = df['title'].str.lower().str.cat(sep=' ')

# Pass the combined text to the project's word-count display helper.
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
7 | volume | 16 |
40 | papers | 6 |
39 | miscellaneous | 6 |
14 | 2 | 5 |
26 | 1 | 4 |
30 | soviet | 4 |
31 | embassy | 4 |
48 | australia | 4 |
32 | contact | 4 |
33 | members | 4 |
34 | parliament | 4 |
28 | projects | 3 |
29 | branch | 3 |
15 | 3 | 3 |
13 | homosexuals | 3 |
27 | special | 3 |
12 | employment | 3 |
11 | directives | 3 |
10 | policy | 3 |
8 | 4 | 3 |
61 | communist | 3 |
62 | party | 3 |
38 | intelligence | 2 |
36 | briefings | 2 |
67 | organisation | 2 |
# Number of words per n-gram. The value 2 counts two-word phrases (bigrams);
# change ngram_count for larger n-grams (3 for trigrams, etc).
ngram_count = 2
# Pass the combined title text and the n-gram size to the project's
# top-n-grams display helper.
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | miscellaneous papers | 6 |
1 | volume 2 | 5 |
2 | contact with | 4 |
3 | volume 1 | 4 |
4 | of australia | 4 |
5 | embassy contact | 4 |
6 | parliament volume | 4 |
7 | members of | 4 |
8 | of parliament | 4 |
9 | soviet embassy | 4 |
10 | with members | 4 |
11 | branch volume | 3 |
12 | volume 4 | 3 |
13 | policy or | 3 |
14 | the employment | 3 |
15 | special projects | 3 |
16 | communist party | 3 |
17 | projects branch | 3 |
18 | or directives | 3 |
19 | of homosexuals | 3 |
20 | directives about | 3 |
21 | homosexuals volume | 3 |
22 | employment of | 3 |
23 | party of | 3 |
24 | volume 3 | 3 |