# Identifier of the series analysed in this notebook. It is used further down
# to locate the matching CSV file under data/ (any '/' in the identifier is
# replaced by '-' for the file name) and is passed to the summary helper.
series = 'A8703'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Switch Plotly's offline module into notebook mode before any figure is drawn
py.init_notebook_mode()

# Series identifiers can contain '/', which is not usable in a file name,
# so the harvested CSV is stored with '-' in its place.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse both date columns on load so they are datetimes rather than strings
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series via the project helper
series_details.display_summary(series, df)
Total items | 641 |
---|---|
Access status | |
Open | 328 (51.17%) |
Not yet examined | 313 (48.83%) |
Number of items digitised | 0 (0.00%) |
Number of pages digitised | 0 |
Date of earliest content | 1937 |
Date of latest content | 1980 |
# How many rows of the dataframe to preview -- raise this value to see more
number_of_rows = 5

# Show the first rows as a styled table: titles are left-aligned, header
# cells are centred, and cells matching '.row_heading, .blank' are hidden
# with CSS.
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 13167211 | A8703 | 1002872 | Laurie Arons General Secretary Communist Party of Australia [CPA] - Wages claim circa 1960s 2. Gietzelt at Eric Aaron's home Sydney 30/9/1969 3. Communist Party of Australia [CPA] meeting Brisbane attended by Laurie Aarons 10/7/1971 4. Mesyatsev, Deurin and Laurie Aarons at Rushcutters Bay Sydney 6/4/1972 - Primary Version | 1969 - 1972 | 1969-01-01 00:00:00 | 1972-01-01 00:00:00 | Open | Various locations | False | 0 |
1 | 13168111 | A8703 | 1004943 | 7 Days [Episode 24] - Interview with Alec Robertson, Chairman of the Tribune - Primary Version | circa1965 - circa1965 | NaT | NaT | Open | Various locations | False | 0 |
2 | 13168130 | A8703 | 1004952 | 7 Days [Episode 6] - Dead Men on Leave and Episode 7: Disdain to Conceal - Primary Version | circa1966 - circa1966 | NaT | NaT | Not yet examined | Various locations | False | 0 |
3 | 13187503 | A8703 | 1043564 | Anti Japanese Rearmament Delegation to the Prime Minister Canberra - 27 February 1952 - Primary Version | 1952 - 1952 | 1952-01-01 00:00:00 | 1952-01-01 00:00:00 | Not yet examined | Various locations | False | 0 |
4 | 13187998 | A8703 | 1044544 | APCHOL Demonstration Melbourne 12 July 1968 - Primary Version | 1968 - 1968 | 1968-01-01 00:00:00 | 1968-01-01 00:00:00 | Open | Various locations | False | 0 |
# Build a figure from the dataframe with the project's plot_dates helper
# (presumably a chart of the items' content dates -- confirm in series_details)
fig = series_details.plot_dates(df)
# Draw the figure in the notebook through Plotly's offline module
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by spaces. (The original chained assignment also
# bound the same string to a throwaway name `a`, which nothing used.)
title_text = df['title'].str.lower().str.cat(sep=' ')

# Hand the combined text to the project helper that displays word counts
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
32 | version | 629 |
31 | primary | 627 |
6 | australia | 232 |
5 | party | 218 |
4 | communist | 205 |
7 | cpa | 203 |
96 | conference | 170 |
17 | sydney | 108 |
60 | melbourne | 102 |
98 | state | 101 |
66 | march | 92 |
20 | meeting | 86 |
115 | day | 83 |
114 | may | 82 |
99 | national | 71 |
21 | brisbane | 55 |
69 | 1962 | 51 |
362 | skripov | 50 |
65 | congress | 49 |
363 | case | 49 |
59 | demonstration | 46 |
101 | youth | 43 |
156 | 1961 | 40 |
123 | september | 39 |
102 | league | 38 |
# Size of the ngrams to count: 2 gives bigrams; change ngram_count for
# larger ngrams (3 for trigrams, etc.)
ngram_count = 2
# title_text is the combined, lower-cased title string built earlier in the file
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | primary version | 627 |
1 | party of | 207 |
2 | of australia | 206 |
3 | communist party | 205 |
4 | australia cpa | 202 |
5 | version communist | 123 |
6 | state conference | 92 |
7 | may day | 53 |
8 | skripov case | 46 |
9 | version may | 45 |
10 | national congress | 44 |
11 | day march | 44 |
12 | version skripov | 43 |
13 | 1962 primary | 40 |
14 | eureka youth | 37 |
15 | youth league | 37 |
16 | 1961 primary | 36 |
17 | league eyl | 33 |
18 | version eureka | 27 |
19 | district conference | 25 |
20 | 1970 primary | 25 |
21 | congress communist | 23 |
22 | 1957 primary | 23 |
23 | 1956 primary | 22 |
24 | 1960 primary | 21 |