# Identifier of the series summarised in this notebook
series = 'B2836'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise plotly's offline notebook mode before any charts are drawn
py.init_notebook_mode()

# The CSV lives in the 'data' directory and is named after the series,
# with any '/' in the identifier replaced by '-'
csv_name = series.replace('/', '-') + '.csv'
csv_path = os.path.join('data', csv_name)

# Parse the start and end date columns as datetimes while loading
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

series_details.display_summary(series, df)
Total items | 14 |
---|---|
Access status | |
Open | 14 (100.00%) |
Number of items digitised | 3 (21.43%) |
Number of pages digitised | 375 |
Date of earliest content | 1926 |
Date of latest content | 1972 |
# Change the number_of_rows value to see more
number_of_rows = 5

# Table-level CSS rules: centre the header cells, and hide the
# row-heading and blank cells
centred_headings = {'selector': 'th', 'props': [('text-align', 'center')]}
hidden_row_headings = {
    'selector': '.row_heading, .blank',
    'props': [('display', 'none')],
}

# Display the first rows of the dataframe with left-aligned titles
preview = df[:number_of_rows].style
preview = preview.set_properties(['title'], **{'text-align': 'left'})
preview.set_table_styles([centred_headings, hidden_row_headings])
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 412483 | B2836 | GROUP 62 | Peace Publications (Three) | 1950 - 1951 | 1950-01-01 00:00:00 | 1951-01-01 00:00:00 | Open | Melbourne | False | 0 |
1 | 412493 | B2836 | GROUP 64 | Publications of the Australia - Soviet Friendship League (35 Leaflets etc) | 1940 - 1944 | 1940-01-01 00:00:00 | 1944-01-01 00:00:00 | Open | Melbourne | False | 0 |
2 | 412501 | B2836 | GROUP 59/4 | A.C.P. Publications | 1932 - 1952 | 1932-01-01 00:00:00 | 1952-01-01 00:00:00 | Open | Melbourne | False | 0 |
3 | 412505 | B2836 | GROUP 60 | CPA Publications and Others (110 Leaflets etc) | circa1952 - 1953 | NaT | 1953-01-01 00:00:00 | Open | Melbourne | True | 264 |
4 | 412512 | B2836 | GROUP 53 PART 1 | "Workers Star" (Nos 129-163, with Gaps) | 1939 - 1939 | 1939-01-01 00:00:00 | 1939-01-01 00:00:00 | Open | Melbourne | False | 0 |
# Build a figure from the dataframe using the series_details helper
fig = series_details.plot_dates(df)
# Render the figure inline with plotly's offline mode
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lowercase string,
# with a space between one title and the next
lowercase_titles = df['title'].str.lower()
# NOTE(review): `a` is a second name bound to the same string; it is not
# used in the visible cells -- confirm nothing else needs it before removing
title_text = a = lowercase_titles.str.cat(sep=' ')
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
1 | publications | 6 |
8 | leaflets | 6 |
9 | etc | 6 |
11 | cpa | 4 |
15 | star | 4 |
3 | australia | 3 |
22 | issues | 3 |
14 | workers | 3 |
27 | communist | 2 |
16 | nos | 2 |
28 | party | 2 |
21 | 10 | 2 |
7 | 35 | 2 |
29 | 56 | 1 |
30 | 48 | 1 |
31 | campaign | 1 |
0 | peace | 1 |
32 | 1951 | 1 |
26 | 51 | 1 |
34 | publictions | 1 |
35 | 83 | 1 |
36 | reference | 1 |
37 | material | 1 |
38 | accumulated | 1 |
33 | referendum | 1 |
# Change ngram_count for larger ngrams (trigrams etc)
# With the value 2, the table produced below lists two-word phrases
ngram_count = 2
# Show the ngrams found in the combined title text, with their counts
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | leaflets etc | 6 |
1 | workers star | 3 |
2 | 35 leaflets | 2 |
3 | other publications | 2 |
4 | australia and | 2 |
5 | etc cpa | 2 |
6 | and other | 2 |
7 | party of | 2 |
8 | star nos | 2 |
9 | cpa communist | 2 |
10 | communist party | 2 |
11 | star 10 | 2 |
12 | 10 issues | 2 |
13 | of australia | 2 |
14 | 48 leaflets | 1 |
15 | publictions 83 | 1 |
16 | of the | 1 |
17 | 129-163 with | 1 |
18 | others 110 | 1 |
19 | by agents | 1 |
20 | material accumulated | 1 |
21 | the workers | 1 |
22 | friendship league | 1 |
23 | campaign 1951 | 1 |
24 | cpa publictions | 1 |