import os

import pandas as pd
import plotly.offline as py

import series_details

# Identifier of the series this notebook summarises.
series = 'D596'

# Switch plotly into notebook mode so figures render inline.
py.init_notebook_mode()

# The harvested CSV lives under data/ and is named after the series, with any
# '/' in the identifier swapped for '-' to keep the file name valid.
csv_name = series.replace('/', '-') + '.csv'
csv_path = os.path.join('data', csv_name)
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary table for the series.
series_details.display_summary(series, df)
Total items | 11,395 |
---|---|
Access status | |
Not yet examined | 8,381 (73.55%) |
Open | 2,983 (26.18%) |
Open with exception | 31 (0.27%) |
Number of items digitised | 185 (1.62%) |
Number of pages digitised | 3,031 |
Date of earliest content | 1871 |
Date of latest content | 1971 |
# How many rows of the dataframe to preview; raise this to see more.
number_of_rows = 5

# Table-level CSS rules: centre the header cells and hide the index column.
centred_headers = {'selector': 'th', 'props': [('text-align', 'center')]}
hidden_index = {'selector': '.row_heading, .blank', 'props': [('display', 'none')]}

# Style the first rows, left-aligning the (often long) title column.
preview = df[:number_of_rows].style.set_properties(subset=['title'], **{'text-align': 'left'})

# Final expression of the cell, so the notebook renders the styled table.
preview.set_table_styles([centred_headers, hidden_index])
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 319709 | D596 | 1902/647 | Immigration Restriction Act - Domicile Certificate | 1902 - 1902 | 1902-01-01 00:00:00 | 1902-01-01 00:00:00 | Open | Adelaide | False | 0 |
1 | 319888 | D596 | 1908/5433 | Chinese prohibited immigrants | 1908 - 1908 | 1908-01-01 00:00:00 | 1908-01-01 00:00:00 | Open | Adelaide | False | 0 |
2 | 320267 | D596 | 1914/5906 | War between Great Britain & Turkey - Proclamation | 1914 - 1914 | 1914-01-01 00:00:00 | 1914-01-01 00:00:00 | Open | Adelaide | False | 0 |
3 | 320290 | D596 | 1914/6869 | Proclamation extending the scope of certain existing proclamations and a certain order in Council connected with the war | 1914 - 1914 | 1914-01-01 00:00:00 | 1914-01-01 00:00:00 | Open | Adelaide | False | 0 |
4 | 320382 | D596 | 1916/1544 | Public Trustee Herman P ZONDER - enemy shareholder | 1917 - 1918 | 1917-01-01 00:00:00 | 1918-01-01 00:00:00 | Open | Adelaide | False | 0 |
# Ask the project helper for a figure built from the dataframe.
# NOTE(review): presumably a bar chart of items by content date (going by the
# 'series-dates-bar' name) -- confirm against series_details.plot_dates.
fig = series_details.plot_dates(df)
# Render the figure inline through plotly's offline mode.
py.iplot(fig, filename='series-dates-bar')
# Lower-case every file title, then join them with single spaces into one
# string that the text-frequency helpers can work on.
lowercase_titles = df['title'].str.lower()
title_text = a = lowercase_titles.str.cat(sep=' ')  # `a` is a second name for the same string

# Show the word frequency table for the combined titles.
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
353 | ltd | 946 |
2 | act | 937 |
3931 | passport | 935 |
376 | ss | 813 |
0 | immigration | 749 |
241 | co | 739 |
3893 | classification | 717 |
207 | adelaide | 601 |
1388 | enquiry | 598 |
112 | customs | 581 |
58 | report | 534 |
5344 | wife | 509 |
423 | tariff | 480 |
6 | prohibited | 450 |
106 | mr | 434 |
206 | port | 427 |
50 | ex | 426 |
274 | duty | 413 |
204 | shipping | 377 |
3114 | permit | 370 |
666 | claim | 364 |
161 | australia | 353 |
321 | goods | 350 |
289 | office | 347 |
298 | regarding | 336 |
# Size of the ngrams to count: 2 gives bigrams; change ngram_count for larger
# ngrams (3 for trigrams, and so on).
ngram_count = 2
# Display the ngrams of that size found in title_text together with their
# counts (title_text is the combined, lower-cased string of all file titles).
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | enquiry by | 537 |
1 | immigration report | 497 |
2 | classification of | 406 |
3 | by wife | 366 |
4 | claim no | 325 |
5 | merchant shipping | 282 |
6 | shipping act | 280 |
7 | co ltd | 270 |
8 | clearing office | 251 |
9 | office claim | 238 |
10 | application for | 233 |
11 | port adelaide | 209 |
12 | tariff classification | 203 |
13 | prohibited publication | 199 |
14 | immigration act | 190 |
15 | passport enquiry | 181 |
16 | landing permit | 173 |
17 | at port | 169 |
18 | official no | 167 |
19 | report mv | 154 |
20 | of the | 149 |
21 | crew ss | 137 |
22 | for duty | 132 |
23 | pty ltd | 130 |
24 | transfer of | 127 |