# Identifier of the series to analyse; it is passed to the summary display and
# also determines which CSV file under data/ is loaded ('/' becomes '-').
series = 'PP6/1'
import os
import pandas as pd
import series_details
import plotly.offline as py
# Set up plotly's offline notebook mode before any py.iplot() call so that
# figures can be rendered in the notebook output.
py.init_notebook_mode()
# Load the harvested series data. Series identifiers contain '/', which is
# swapped for '-' to form the CSV filename inside the data directory.
df = pd.read_csv(
    os.path.join('data', series.replace('/', '-') + '.csv'),
    parse_dates=['start_date', 'end_date'],
)

# Show the summary table for this series.
series_details.display_summary(series, df)
Total items | 6,010 |
---|---|
Access status | |
Not yet examined | 4,109 (68.37%) |
Open | 1,863 (31.00%) |
Open with exception | 33 (0.55%) |
Closed | 5 (0.08%) |
Number of items digitised | 245 (4.08%) |
Number of pages digitised | 6,461 |
Date of earliest content | 1906 |
Date of latest content | 1978 |
# Number of rows to preview; increase this value to see more of the series.
number_of_rows = 5

# Render the preview: left-align the title column, centre the header cells,
# and hide the row-heading and blank corner cells (i.e. the index column).
(
    df[:number_of_rows]
    .style
    .set_properties(['title'], **{'text-align': 'left'})
    .set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},
        {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
    ])
)
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 326472 | PP6/1 | 1931/H/318 | Maltese Migration | 1931 - 1931 | 1931-01-01 00:00:00 | 1931-01-01 00:00:00 | Open | Perth | False | 0 |
1 | 326477 | PP6/1 | 1927/H/325 | Ah Moy [Chinese] Application for Certificate of Exemption from Dictation Test [CEDT] [contains photos] | 1911 - 1929 | 1911-01-01 00:00:00 | 1929-01-01 00:00:00 | Open | Perth | False | 0 |
2 | 326492 | PP6/1 | 1927/H/427 | Immigration Act 1901-1925 Dictation Test [contains set of directions to be observed when applying test] | 1927 - 1927 | 1927-01-01 00:00:00 | 1927-01-01 00:00:00 | Open | Perth | True | 5 |
3 | 326496 | PP6/1 | 1927/H/533 | George Albert Orwin - Re son Leslie Orwin entering Australia suffering from exema | 1927 - 1928 | 1927-01-01 00:00:00 | 1928-01-01 00:00:00 | Open | Perth | False | 0 |
4 | 326503 | PP6/1 | 1927/H/567 | Applcation for admission of Eleanor Easom by her son Robert Easom | 1927 - 1927 | 1927-01-01 00:00:00 | 1927-01-01 00:00:00 | Open | Perth | False | 0 |
# Build the dates chart for this series; the figure itself is constructed by
# series_details.plot_dates from the dataframe.
fig = series_details.plot_dates(df)
# Render the figure in the notebook output.
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lower-cased string, with
# individual titles separated by a space.
title_text = df['title'].str.lower().str.cat(sep=' ')

# Display the word frequencies for the combined titles.
series_details.display_word_counts(title_text)
word | count | |
---|---|---|
5 | application | 4,983 |
26 | australia | 4,325 |
30 | admission | 4,156 |
948 | naturalisation | 578 |
380 | giuseppe | 522 |
459 | maria | 342 |
100 | antonio | 335 |
2177 | giovanni | 297 |
357 | francesco | 250 |
492 | domenico | 214 |
394 | vincenzo | 196 |
42 | john | 196 |
20 | george | 172 |
504 | luigi | 168 |
892 | de | 165 |
323 | permanent | 158 |
7 | exemption | 153 |
482 | pietro | 150 |
6 | certificate | 140 |
1689 | residence | 136 |
881 | salvatore | 131 |
884 | angelo | 128 |
1295 | carmelo | 110 |
455 | michele | 108 |
195 | peter | 93 |
# Number of words per ngram: 2 gives bigrams; change ngram_count to a larger
# value for longer ngrams (3 for trigrams, etc.).
ngram_count = 2
# Display the most frequent ngrams of that size found in the combined titles.
series_details.display_top_ngrams(title_text, ngram_count)
ngram | count | |
---|---|---|
0 | application for | 4,907 |
1 | to australia | 4,099 |
2 | for admission | 4,065 |
3 | admission of | 4,006 |
4 | for naturalisation | 542 |
5 | of giuseppe | 176 |
6 | australia giuseppe | 166 |
7 | in australia | 145 |
8 | of exemption | 142 |
9 | permanent residence | 132 |
10 | for permanent | 128 |
11 | certificate of | 124 |
12 | australia giovanni | 118 |
13 | of maria | 112 |
14 | for certificate | 106 |
15 | residence in | 104 |
16 | australia antonio | 99 |
17 | of francesco | 98 |
18 | australia john | 83 |
19 | admission to | 81 |
20 | of antonio | 77 |
21 | of giovanni | 76 |
22 | australia vincenzo | 74 |
23 | of domenico | 68 |
24 | australia domenico | 68 |