# Identifier of the series summarised by this notebook.
series = 'A9626'

import os
import pandas as pd
import series_details
import plotly.offline as py

# Put plotly's offline mode into notebook rendering before any figures are drawn.
py.init_notebook_mode()

# The CSV file is named after the series, with any '/' in the identifier
# swapped for '-' so that it forms a single file name inside 'data'.
csv_name = series.replace('/', '-') + '.csv'
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

series_details.display_summary(series, df)
Total items | 1,075 |
---|---|
Access status | |
Open | 792 (73.67%) |
Open with exception | 277 (25.77%) |
Not yet examined | 6 (0.56%) |
Number of items digitised | 570 (53.02%) |
Number of pages digitised | 9,370 |
Date of earliest content | 1919 |
Date of latest content | 1998 |
# Change the number_of_rows value to see more
number_of_rows = 5

# CSS rules applied to the rendered table: centre the header cells and
# hide the row-heading / blank corner cells.
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display dataframe: first rows only, with the title column left-aligned.
preview = df[:number_of_rows].style
preview = preview.set_properties(subset=['title'], **{'text-align': 'left'})
preview.set_table_styles(table_styles)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1188387 | A9626 | 1 | ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat] | circa1950 - circa1950 | NaT | NaT | Open | Canberra | False | 0 |
1 | 1188390 | A9626 | 2 | ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat] | circa1950 - circa1950 | NaT | NaT | Open | Canberra | False | 0 |
2 | 1188393 | A9626 | 3 | ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat] | circa1950 - circa1950 | NaT | NaT | Open | Canberra | True | 1 |
3 | 1188395 | A9626 | 4 | ASIO surveillance photograph of James Frederick Hill [former Department of External Affairs diplomat] | circa1950 - circa1950 | NaT | NaT | Open | Canberra | False | 0 |
4 | 1188399 | A9626 | 5 | ASIO photograph of John Wear Burton [former Secretary of Department of External Affairs] | circa1950 - circa1950 | NaT | NaT | Open | Canberra | False | 0 |
# Build a figure from the dataframe with the project's plot_dates helper
# (presumably a bar chart of the items' dates, going by the filename below -- confirm in series_details).
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook using plotly's offline mode.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string,
# with individual titles separated by a space.
# (The stray alias `a` from the original chained assignment was unused and has been dropped.)
title_text = df['title'].str.lower().str.cat(sep=' ')

# Show the word frequencies for the combined titles.
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
0 | asio | 246 |
1 | surveillance | 243 |
2 | photograph | 237 |
174 | robinson | 161 |
198 | aka | 100 |
11 | john | 91 |
3 | james | 70 |
30 | william | 67 |
173 | albert | 56 |
175 | eva | 51 |
101 | march | 43 |
440 | number | 41 |
187 | george | 39 |
80 | may | 33 |
178 | max | 31 |
82 | sydney | 30 |
146 | demonstration | 29 |
208 | robert | 28 |
121 | david | 28 |
179 | ernest | 28 |
162 | photographs | 28 |
131 | doreen | 28 |
81 | day | 26 |
195 | nee | 25 |
153 | canberra | 23 |
# ngram_count sets the ngram size passed to display_top_ngrams:
# 2 gives bigrams; increase it for larger ngrams (3 for trigrams, etc.).
ngram_count = 2
# Show the most frequent ngrams of that size in the combined title text.
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | asio surveillance | 237 |
1 | surveillance photograph | 228 |
2 | photograph of | 208 |
3 | robinson asio | 119 |
4 | albert robinson | 51 |
5 | eva robinson | 50 |
6 | of eva | 48 |
7 | of albert | 48 |
8 | william robinson | 29 |
9 | photograph james | 28 |
10 | james william | 28 |
11 | ernest robinson | 25 |
12 | max ernest | 25 |
13 | of max | 25 |
14 | may day | 24 |
15 | day march | 23 |
16 | at the | 15 |
17 | scan of | 13 |
18 | resolution scan | 13 |
19 | high resolution | 13 |
20 | of page | 13 |
21 | doreen burrow | 12 |
22 | of doreen | 12 |
23 | burrow at | 12 |
24 | zangalis george | 11 |