# Identifier of the series that this notebook summarises.
series = 'P437'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Prepare plotly's offline mode so figures can be rendered inside the notebook.
py.init_notebook_mode()

# The harvested CSV lives in the 'data' directory and is named after the
# series, with any '/' in the identifier swapped for '-'.
csv_path = os.path.join('data', '{}.csv'.format(series.replace('/', '-')))

# Parse the two date columns as datetimes while loading.
df = pd.read_csv(
    csv_path,
    parse_dates=['start_date', 'end_date']
)

# Show the summary table for the series.
series_details.display_summary(series, df)
Total items | 4,958 |
---|---|
Access status | |
Open | 4,945 (99.74%) |
Open with exception | 10 (0.20%) |
Not yet examined | 2 (0.04%) |
Closed | 1 (0.02%) |
Number of items digitised | 18 (0.36%) |
Number of pages digitised | 442 |
Date of earliest content | 1901 |
Date of latest content | 1940 |
# Change the number_of_rows value to see more
number_of_rows = 5

# Preview the first rows as a styled table: titles are left-aligned, header
# cells are centred, and the row-index column is hidden through CSS.
(
    df.head(number_of_rows)
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 538211 | P437 | 1940/279 | Restricted drugs | 1939 - 1940 | 1939-01-01 00:00:00 | 1940-01-01 00:00:00 | Closed | Hobart | False | 0 |
1 | 542152 | P437 | WHOLE SERIES | Correspondence files of the Collector of Customs, Hobart for the period 1908 to 1940; covers such topics as tariffs, trade, duty, immigration, export permits, patents lighthouse service, ship wrecks, passports,various grant and Bounty schemes | 1908 - 1940 | 1908-01-01 00:00:00 | 1940-01-01 00:00:00 | Open with exception | Hobart | False | 0 |
2 | 635923 | P437 | 1910/12 | Accounts - postage paid - Board of Trade Journals | 1909 - 1910 | 1909-01-01 00:00:00 | 1910-01-01 00:00:00 | Open | Hobart | False | 0 |
3 | 642439 | P437 | 1910/14 | Bank guarantees - cancellation of - The Commercial Bank of Tasmania Ltd. | 1910 - 1910 | 1910-01-01 00:00:00 | 1910-01-01 00:00:00 | Open | Hobart | False | 0 |
4 | 642442 | P437 | 1910/15 | Imports of Fire Arms to Tasmania - January to June 1909. Returns. | 1910 - 1910 | 1910-01-01 00:00:00 | 1910-01-01 00:00:00 | Open | Hobart | False | 0 |
# Build a figure from the dataframe with the project's plot_dates helper,
# then render it inline using plotly's offline iplot.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with the
# individual titles separated by spaces. This string is the input for the
# word-frequency and ngram tables.
title_text = df['title'].str.lower().str.cat(sep=' ')

# Show the most frequent words found in the combined titles.
series_details.display_word_counts(title_text)
word | count | |
---|---|---|
85 | request | 508 |
14 | duty | 382 |
5 | customs | 345 |
252 | act | 238 |
107 | return | 213 |
37 | tasmania | 200 |
127 | goods | 196 |
39 | imports | 186 |
219 | mr | 177 |
16 | export | 161 |
61 | application | 155 |
200 | company | 149 |
332 | ss | 135 |
6 | hobart | 134 |
146 | import | 128 |
89 | forms | 127 |
125 | list | 121 |
195 | forwarded | 118 |
100 | certificate | 117 |
292 | launceston | 114 |
141 | imported | 102 |
302 | beer | 102 |
189 | officers | 97 |
212 | invoice | 95 |
108 | commonwealth | 94 |
# Change ngram_count for larger ngrams (trigrams etc).
# ngram_count is the number of words per ngram: 2 gives bigrams, 3 gives trigrams.
ngram_count = 2
# Show the most frequent ngrams of that size found in the combined titles.
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | request for | 354 |
1 | to be | 121 |
2 | application for | 111 |
3 | of customs | 88 |
4 | duty on | 83 |
5 | of duty | 82 |
6 | certificate of | 82 |
7 | of the | 76 |
8 | for the | 72 |
9 | return of | 68 |
10 | and company | 66 |
11 | export of | 64 |
12 | with the | 62 |
13 | commerce act | 61 |
14 | import of | 61 |
15 | collector of | 59 |
16 | list of | 59 |
17 | return showing | 57 |
18 | imports of | 56 |
19 | being forwarded | 55 |
20 | trading with | 55 |
21 | of exemption | 54 |
22 | the enemy | 53 |
23 | importation of | 51 |
24 | of goods | 49 |