# Identifier of the series summarised in this notebook; it is used further down
# to build the CSV file name and is passed to the summary display.
series = 'A6119'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Switch plotly's offline module into notebook mode before any figure is drawn
py.init_notebook_mode()

# The CSV for a series lives under data/ and is named after the series
# identifier, with any '/' swapped for '-'
csv_name = series.replace('/', '-') + '.csv'
csv_path = os.path.join('data', csv_name)

# Load the series data, parsing both date columns as datetimes
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Display the summary for this series
series_details.display_summary(series, df)
Total items | 6,741 |
---|---|
Access status | |
Open with exception | 6,314 (93.67%) |
Not yet examined | 363 (5.38%) |
Open | 43 (0.64%) |
Closed | 20 (0.30%) |
Withheld pending agency advice | 1 (0.01%) |
Number of items digitised | 2,320 (34.42%) |
Number of pages digitised | 258,547 |
Date of earliest content | 1852 |
Date of latest content | 2009 |
# Number of rows to preview; change the number_of_rows value to see more
number_of_rows = 5

# CSS rules for the preview table: centre the header cells and suppress cells
# matching '.row_heading' / '.blank' (presumably the index column -- confirm)
preview_table_styles = [
    dict(selector="th", props=[("text-align", "center")]),
    dict(selector='.row_heading, .blank', props=[('display', 'none')]),
]

# Style the leading rows with the title column left-aligned; the Styler is the
# final expression so that the notebook displays it
preview_styler = df[:number_of_rows].style
preview_styler = preview_styler.set_properties(['title'], **{'text-align': 'left'})
preview_styler.set_table_styles(preview_table_styles)
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 240734 | A6119 | 13 ATTACHMENT | BURCHETT, Wilfred Graham - Monitoring reports ex UNC. | 1951 - 1953 | 1951-01-01 00:00:00 | 1953-01-01 00:00:00 | Open | Canberra | True | 209 |
1 | 273730 | A6119 | 42 | THROSSELL, Katharine [Katherine Susannah] nee PRICHARD [PRITCHARD] - Volume 1 | 1919 - 1940 | 1919-01-01 00:00:00 | 1940-01-01 00:00:00 | Open with exception | Canberra | True | 127 |
2 | 276650 | A6119 | 44 PART 1 | THROSSELL, Katharine [Katherine] Susannah nee PRICHARD [PRITCHARD] - Volume 3 Part 1 (folios 1 to 20) | 1945 - 1948 | 1945-01-01 00:00:00 | 1948-01-01 00:00:00 | Open with exception | Canberra | True | 27 |
3 | 279177 | A6119 | 43 | THROSSELL, Katharine [Katherine] Susannah nee PRICHARD [PRITCHARD] - Volume 2 | 1941 - 1952 | 1941-01-01 00:00:00 | 1952-01-01 00:00:00 | Open with exception | Canberra | True | 170 |
4 | 332779 | A6119 | 260 | TERKES Ferdo Formerlu Pijevic Joso | 1954 - 1955 | 1954-01-01 00:00:00 | 1955-01-01 00:00:00 | Open with exception | Canberra | False | 0 |
# Build the dates figure for this series with the series_details helper, then
# draw it with plotly's offline iplot.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by a space
title_text = df['title'].str.lower().str.cat(sep=' ')
# Display word counts for the combined title text
series_details.display_word_counts(title_text)
word | count | |
---|---|---|
14 | volume | 4,768 |
15 | 1 | 1,590 |
20 | 2 | 937 |
49 | aka | 752 |
32 | john | 729 |
16 | 3 | 532 |
112 | 4 | 376 |
259 | papers | 350 |
168 | william | 343 |
258 | miscellaneous | 327 |
493 | 5 | 278 |
137 | james | 277 |
30 | george | 275 |
11 | nee | 228 |
705 | aarons | 219 |
488 | 6 | 214 |
233 | robert | 206 |
129 | francis | 182 |
236 | alexander | 170 |
487 | 7 | 164 |
288 | joseph | 159 |
171 | david | 158 |
385 | michael | 154 |
130 | edward | 151 |
152 | charles | 139 |
# Number of words per ngram: 2 counts two-word sequences.
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Display the top ngrams of that size found in the combined title text
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | volume 1 | 1,483 |
1 | volume 2 | 848 |
2 | volume 3 | 496 |
3 | volume 4 | 354 |
4 | miscellaneous papers | 320 |
5 | volume 5 | 265 |
6 | john volume | 236 |
7 | volume 6 | 200 |
8 | volume 7 | 157 |
9 | volume 8 | 122 |
10 | volume 9 | 102 |
11 | asio file | 96 |
12 | laurence volume | 90 |
13 | francis volume | 89 |
14 | volume 10 | 88 |
15 | aarons laurence | 87 |
16 | george volume | 79 |
17 | albert volume | 77 |
18 | william volume | 76 |
19 | volume 11 | 72 |
20 | volume 12 | 65 |
21 | david volume | 62 |
22 | james volume | 61 |
23 | robert volume | 57 |
24 | wilton john | 55 |