# Identifier of the series to summarise; the data file name is derived from it.
series = 'SP115/1'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Initialise Plotly's offline notebook mode before any figures are drawn.
py.init_notebook_mode()
# The data file is named after the series, with '/' replaced by '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)
# Parse the two date columns so they load as datetimes rather than strings.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])
series_details.display_summary(series, df)
Total items | 1,787 |
---|---|
Access status | |
Open | 1,787 (100.00%) |
Number of items digitised | 9 (0.50%) |
Number of pages digitised | 285 |
Date of earliest content | 1884 |
Date of latest content | 1943 |
# Change the number_of_rows value to see more
number_of_rows = 5
# Display dataframe
# Table-wide CSS rules: centre the header cells and suppress the cells carrying
# the '.row_heading' / '.blank' classes so they are not shown.
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]
preview = df[:number_of_rows].style
# Left-align only the 'title' column.
preview = preview.set_properties(['title'], **{'text-align': 'left'})
# Keep this as the final bare expression so the notebook renders the styled table.
preview.set_table_styles(table_styles)
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1592127 | SP115/1 | UGANDA - 13/05/1915 [BOX 15] | UGANDA - Date of Arrival 13/05/1915 - [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15] | 1912 - 1915 | 1912-01-01 00:00:00 | 1915-01-01 00:00:00 | Open | Sydney | False | 0 |
1 | 1592383 | SP115/1 | JOSEPH SIMMS - 13/05/1915 [BOX 15] | JOSEPH SIMMS - Date of Arrival 13/05/1915 [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15] | 1914 - 1915 | 1914-01-01 00:00:00 | 1915-01-01 00:00:00 | Open | Sydney | False | 0 |
2 | 1592840 | SP115/1 | TAIYUAN - [PART 1] - 30/05/1915 [BOX 15] | TAIYUAN - [Part 1] - Date of Arrival 30/05/1915 - [Certificates of Exemption for passengers; includes photographs and hand prints][[Box 15] | 1914 - 1915 | 1914-01-01 00:00:00 | 1915-01-01 00:00:00 | Open | Sydney | False | 0 |
3 | 1592858 | SP115/1 | TAIYUAN -[PART 2] - 30/05/1915 [BOX 15] | TAIYUAN -[Part 2] - Date of Arrival 30/05/1915 - [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15] | 1905 - 1915 | 1905-01-01 00:00:00 | 1915-01-01 00:00:00 | Open | Sydney | False | 0 |
4 | 1592871 | SP115/1 | EASTERN - [PART 1] - 05/06/1915 [BOX 15] | EASTERN - [Part 1] - Date of Arrival 05/06/1915 [Certificates of Exemption for passengers; includes photographs and hand prints][Box 15] | 1914 - 1915 | 1914-01-01 00:00:00 | 1915-01-01 00:00:00 | Open | Sydney | False | 0 |
# Build a figure from the dataframe using the project's plot_dates helper
# (presumably a bar chart of the items' content dates, going by the filename
# below -- confirm against series_details).
fig = series_details.plot_dates(df)
# Render the figure inline in the notebook.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, joined
# with single spaces so that word boundaries between titles are preserved.
title_text = df['title'].str.lower().str.cat(sep=' ')
# Show the word-frequency table for the combined titles.
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
4 | certificates | 1,773 |
11 | box | 1,763 |
5 | exemption | 1,697 |
6 | passengers | 1,696 |
7 | includes | 1,690 |
1 | date | 1,687 |
8 | photographs | 1,684 |
2 | arrival | 1,682 |
9 | hand | 1,612 |
10 | prints | 1,612 |
31 | pages | 664 |
16 | part | 628 |
39 | 2cm | 415 |
17 | 1 | 388 |
19 | 2 | 255 |
60 | maru | 189 |
22 | 3 | 169 |
23 | 4 | 156 |
36 | st | 140 |
58 | 1cm | 137 |
37 | albans | 135 |
24 | 5 | 130 |
20 | eastern | 126 |
749 | taiping | 119 |
689 | tanda | 105 |
# Change ngram_count for larger ngrams (trigrams etc)
# 2 = bigrams (pairs of adjacent words), 3 = trigrams, and so on.
ngram_count = 2
# Show the most frequent n-grams of the chosen size in the combined titles.
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | of exemption | 1,697 |
1 | exemption for | 1,696 |
2 | for passengers | 1,696 |
3 | passengers includes | 1,689 |
4 | date of | 1,687 |
5 | certificates of | 1,686 |
6 | includes photographs | 1,683 |
7 | photographs and | 1,683 |
8 | of arrival | 1,682 |
9 | hand prints | 1,612 |
10 | and hand | 1,606 |
11 | pages box | 655 |
12 | 2cm box | 404 |
13 | prints 2cm | 399 |
14 | prints box | 350 |
15 | 1 date | 233 |
16 | part 1 | 232 |
17 | part 2 | 220 |
18 | 2 date | 218 |
19 | 1cm box | 136 |
20 | st albans | 135 |
21 | prints 1cm | 118 |
22 | eastern part | 115 |
23 | 3 date | 112 |
24 | part 3 | 111 |