# Identifier of the series this notebook analyses. It is used further down to
# build the CSV file name under data/ and is passed to the summary display.
series = 'B13'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Switch Plotly's offline module into notebook mode before any figure is drawn.
py.init_notebook_mode()

# Build the path data/<series>.csv, replacing any '/' in the series
# identifier with '-' for the file name.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Load the item records, parsing both date columns as datetimes.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Show the summary for this series.
series_details.display_summary(series, df)
Total items | 20,194 |
---|---|
Access status | |
Open | 19,786 (97.98%) |
Not yet examined | 400 (1.98%) |
Open with exception | 8 (0.04%) |
Number of items digitised | 354 (1.75%) |
Number of pages digitised | 5,043 |
Date of earliest content | 1800 |
Date of latest content | 2005 |
# Change the number_of_rows value to see more
number_of_rows = 5

# CSS rules applied to the rendered table: centre every header cell, and
# hide the row-heading / blank corner cells.
centred_headers = {'selector': 'th', 'props': [('text-align', 'center')]}
hidden_row_headings = {
    'selector': '.row_heading, .blank',
    'props': [('display', 'none')],
}

# Display dataframe: first number_of_rows rows, with the title column
# left-aligned. The styler is the last expression so the notebook renders it.
preview = df[:number_of_rows].style.set_properties(
    subset=['title'], **{'text-align': 'left'}
)
preview.set_table_styles([centred_headers, hidden_row_headings])
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 787258 | B13 | 1924/7516 | Charlie Lam Sun (Charlie Shack Mayberry) - Arrived Sydney per "Taiyuan" 15.3.1924 | 1924 - circa1924 | 1924-01-01 00:00:00 | NaT | Open | Melbourne | False | 0 |
1 | 790335 | B13 | 1926/6755 | Edward Traynor - permission to enter Australia - arrived per "Beltana" 13.5.1926 | 1926 - 1926 | 1926-01-01 00:00:00 | 1926-01-01 00:00:00 | Open | Melbourne | False | 0 |
2 | 3280504 | B13 | V1960/14261 | Tabacco sales in Victoria [1.00 cms] | 1960 - 1962 | 1960-01-01 00:00:00 | 1962-01-01 00:00:00 | Open | Melbourne | False | 0 |
3 | 3280538 | B13 | V1979/4475 | James Richardson Co Pty Ltd, Licensed Warehouse, Richardsons Bond [Contains plans of Richardssons Bons] [4.00 cms] | 1963 - 1984 | 1963-01-01 00:00:00 | 1984-01-01 00:00:00 | Open with exception | Melbourne | False | 0 |
4 | 3283801 | B13 | V1953/12491 | Tobacco & Cigarettes - Duty Free issues to Ships Crews [2cm] | 1945 - 1957 | 1945-01-01 00:00:00 | 1957-01-01 00:00:00 | Open | Melbourne | False | 0 |
# Build the figure from the loaded records (plot_dates is defined in the
# project's series_details module) and render it inline in the notebook.
fig = series_details.plot_dates(df)
py.iplot(figure_or_data=fig, filename='series-dates-bar')
# Combine all of the file titles into a single string: lower-case every
# title, then join them with single spaces.
title_text = (
    df['title']
    .str.lower()
    .str.cat(sep=' ')
)
# `a` is a second name for the same string; none of the cells shown here
# read it, but the binding is preserved.
a = title_text

# Display the word counts computed from the combined titles.
series_details.display_word_counts(title_text)
word | count | |
---|---|---|
7 | per | 5,172 |
314 | ex | 4,363 |
902 | exemption | 3,688 |
618 | certificate | 3,577 |
1655 | dictation | 3,577 |
1581 | test | 3,553 |
71 | melbourne | 3,168 |
538 | application | 2,442 |
174 | departure | 2,006 |
14 | australia | 1,977 |
1543 | ah | 1,796 |
949 | passengers | 1,620 |
173 | arrival | 1,560 |
26 | ltd | 1,446 |
104 | act | 1,180 |
1482 | mrs | 1,175 |
6 | sydney | 1,075 |
1861 | s.s | 1,074 |
12 | permission | 1,050 |
830 | crew | 1,015 |
25 | pty | 950 |
1621 | applied | 927 |
1583 | chinese | 862 |
2049 | enemy | 858 |
24 | co | 835 |
# Change ngram_count for larger ngrams (trigrams etc)
# With the value 2 the table produced below lists two-word phrases.
ngram_count = 2
# Display the most frequent ngrams of that size found in the combined titles.
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | exemption from | 3,550 |
1 | from dictation | 3,540 |
2 | dictation test | 3,533 |
3 | for exemption | 3,004 |
4 | certificate for | 2,872 |
5 | for certificate | 2,660 |
6 | application for | 2,236 |
7 | melbourne per | 1,054 |
8 | departure per | 1,009 |
9 | pty ltd | 927 |
10 | applied for | 923 |
11 | to australia | 859 |
12 | trading with | 782 |
13 | enemy act | 765 |
14 | with enemy | 764 |
15 | test ah | 735 |
16 | permission to | 698 |
17 | act 1939 | 691 |
18 | of exemption | 636 |
19 | certificate of | 625 |
20 | crew member | 623 |
21 | arrival per | 458 |
22 | to enter | 437 |
23 | of certificate | 419 |
24 | passengers melbourne | 390 |