# Identifier of the record series this notebook works with.
series = 'SP726/1'

import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise Plotly's offline notebook mode before any figure is drawn.
py.init_notebook_mode()

# The series data is read from data/<series>.csv, where any '/' in the
# series identifier is replaced by '-' to form the file name.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as datetimes while loading.
date_columns = ['start_date', 'end_date']
df = pd.read_csv(csv_path, parse_dates=date_columns)

# Hand the series identifier and its dataframe to the summary helper.
series_details.display_summary(series, df)
Total items | 6 |
---|---|
Access status | |
Open | 6 (100.00%) |
Number of items digitised | 0 (0.00%) |
Number of pages digitised | 0 |
Date of earliest content | 1902 |
Date of latest content | 1959 |
# How many rows to preview -- raise number_of_rows to see more
number_of_rows = 5

# Table-level CSS rules: centre the header cells, and set the index cells
# (pandas' .row_heading / .blank classes) to display: none
centre_headers = {'selector': "th", 'props': [("text-align", "center")]}
hide_index_cells = {'selector': '.row_heading, .blank', 'props': [('display', 'none')]}

# Style the first rows of the dataframe with the title column left-aligned.
# The final expression evaluates to the Styler, which is what the cell shows.
preview = df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'})
preview.set_table_styles([centre_headers, hide_index_cells])
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12145139 | SP726/1 | BOOK 2 | Register of names relating to exemption from Dictation Tests (1911-1918) | 1911 - 1918 | 1911-01-01 00:00:00 | 1918-01-01 00:00:00 | Open | Sydney | False | 0 |
1 | 12145140 | SP726/1 | BOOK 3 | Register of names relating to exemption from Dictation Tests (1918-1925) | 1918 - 1925 | 1918-01-01 00:00:00 | 1925-01-01 00:00:00 | Open | Sydney | False | 0 |
2 | 12145522 | SP726/1 | BOOK 1 | Register of names relating to exemption from Dictation Tests (1902-1910) | 1902 - 1910 | 1902-01-01 00:00:00 | 1910-01-01 00:00:00 | Open | Sydney | False | 0 |
3 | 12154684 | SP726/1 | BOOK 4 | Register of names relating to exemption from Dictation Tests (1919-1924) | 1919 - 1924 | 1919-01-01 00:00:00 | 1924-01-01 00:00:00 | Open | Sydney | False | 0 |
4 | 12154685 | SP726/1 | BOOK 5 | Register of names relating to exemption from Dictation Tests (1925-1934) | 1925 - 1934 | 1925-01-01 00:00:00 | 1934-01-01 00:00:00 | Open | Sydney | False | 0 |
# Build a figure from the dataframe with series_details.plot_dates and pass
# it to Plotly's offline iplot.
# NOTE(review): plot_dates is defined outside this file; presumably it returns
# a Plotly figure built from the start_date/end_date columns -- confirm.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Lower-case every item title, then join them all into one space-separated string
lowercase_titles = df['title'].str.lower()
title_text = lowercase_titles.str.cat(sep=' ')
# NOTE(review): `a` is a second name bound to the same string; nothing in the
# visible code reads it -- confirm it is unused before removing.
a = title_text
# Pass the combined text to the word-count display helper
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
0 | register | 6 |
1 | names | 6 |
2 | relating | 6 |
3 | exemption | 6 |
4 | dictation | 6 |
5 | tests | 6 |
6 | 1911-1918 | 1 |
7 | 1918-1925 | 1 |
8 | 1902-1910 | 1 |
9 | 1919-1924 | 1 |
10 | 1925-1934 | 1 |
11 | 1934-1959 | 1 |
# Number of words per ngram: 2 gives bigrams.
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Pass the combined title text and the ngram size to the display helper.
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | register of | 6 |
1 | exemption from | 6 |
2 | dictation tests | 6 |
3 | of names | 6 |
4 | names relating | 6 |
5 | from dictation | 6 |
6 | relating to | 6 |
7 | to exemption | 6 |
8 | tests 1934-1959 | 1 |
9 | tests 1918-1925 | 1 |
10 | 1919-1924 register | 1 |
11 | 1902-1910 register | 1 |
12 | 1918-1925 register | 1 |
13 | 1925-1934 register | 1 |
14 | tests 1911-1918 | 1 |
15 | tests 1902-1910 | 1 |
16 | 1911-1918 register | 1 |
17 | tests 1925-1934 | 1 |
18 | tests 1919-1924 | 1 |