import os
import pandas as pd
from IPython.display import Image as DImage
from IPython.core.display import display, HTML
import series_details
# Plotly helps us make pretty charts
import plotly.offline as py
import plotly.graph_objs as go
# Switch Plotly into notebook mode so that charts are drawn inside notebook cells
py.init_notebook_mode()
This notebook is for analysing a series that you've already harvested. If you haven't harvested any data yet, then you need to go back to the ['Harvesting a series' notebook](Harvesting series.ipynb).
# Which series do you want to analyse?
# Insert the series id between the quotes.
series = 'B13'
# The CSV is named after the series, with any slashes in the id swapped for dashes.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)
# Read the CSV into a dataframe, asking pandas to parse the two date columns as dates.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])
We're going to create a simple summary of some of the main characteristics of the series, as reflected in the harvested files.
# The 'summary' dictionary collects the headline facts about the series.
# It starts out holding just the series identifier.
summary = {}
summary['series'] = series
# There is one row per harvested item, so the number of rows is the number of items.
summary['total_items'] = len(df)
print(summary['total_items'])
20194
# Count how many items fall into each access status category,
# then store the counts as a plain {status: total} dictionary
access_status_totals = df['access_status'].value_counts()
summary['access_counts'] = access_status_totals.to_dict()
print(summary['access_counts'])
{'Open': 19786, 'Not yet examined': 400, 'Open with exception': 8}
# Keep only the rows flagged as digitised, then count them
digitised_items = df.loc[df['digitised_status'] == True]
summary['digitised_files'] = digitised_items.shape[0]
print(summary['digitised_files'])
354
# Add up the per-item page counts to get the total number of digitised pages
page_counts = df['digitised_pages']
summary['digitised_pages'] = page_counts.sum()
print(summary['digitised_pages'])
5043
# Find the earliest start date in the series and keep just its year
earliest_start = df['start_date'].min()
summary['date_from'] = earliest_start.year
print(summary['date_from'])
1800
# Find the latest end date in the series and keep just its year
latest_end = df['end_date'].max()
summary['date_to'] = latest_end.year
print(summary['date_to'])
2005
# Assemble the summary report line by line, then print it
report_lines = [
    'SERIES: {}'.format(summary['series']),
    'Number of items: {:,}'.format(summary['total_items']),
    'Access status:',
]
# One line for each access status category
report_lines.extend(
    ' {}: {:,}'.format(status, total)
    for status, total in summary['access_counts'].items()
)
report_lines.append('Contents dates: {} to {}'.format(summary['date_from'], summary['date_to']))
report_lines.append('Digitised files: {:,}'.format(summary['digitised_files']))
report_lines.append('Digitised pages: {:,}'.format(summary['digitised_pages']))
for report_line in report_lines:
    print(report_line)
SERIES: B13 Number of items: 20,194 Access status: Open: 19,786 Not yet examined: 400 Open with exception: 8 Contents dates: 1800 to 2005 Digitised files: 354 Digitised pages: 5,043
Note that a slightly enhanced version of the code above is available in the series_details
module that you can import into any notebook. So to create a summary of a series you can just:
# Import the module
import series_details
# Call display_summary(), providing the series name and the dataframe
series_details.display_summary(series, df)
Total items | 20,194 |
---|---|
Access status | |
Open | 19,786 (97.98%) |
Not yet examined | 400 (1.98%) |
Open with exception | 8 (0.04%) |
Number of items digitised | 354 (1.75%) |
Number of pages digitised | 5,043 |
Date of earliest content | 1800 |
Date of latest content | 2005 |
Plotting the dates is a bit tricky. Each file can have both a start date and an end date. So if we want to plot the years covered by a file, we need to include all the years between the start and end dates. Also dates can be recorded at different levels of granularity, from specific days to just years. And sometimes there are no end dates recorded at all – what does this mean?
The code in the cell below does a few things:
I'm sure this is not perfect, but it seems to produce useful results.
# Fill any blank end dates with start dates, so an item without an end date counts for its start year only
df['end_date'] = df['end_date'].fillna(df['start_date'])
# This is a bit tricky.
# For each item we want to find the years that it has content from -- ie start_year <= year <= end_year.
# We work with the year numbers directly, rather than with a date range anchored to 1 January,
# because an anchored range leaves out the first year of any item whose start date isn't 1 January.
# Items that still have a missing date can't be assigned to a year, so they are skipped.
dated_items = df.dropna(subset=['start_date', 'end_date'])
content_years = [
    year
    for row in dated_items.itertuples(index=False)
    for year in range(row.start_date.year, row.end_date.year + 1)
]
# Then we put all the years from all the items together and look at their frequency
years = pd.Series(content_years, dtype='int64').value_counts()
# Put the resulting series in a dataframe so it looks pretty.
# The column is labelled 0 explicitly, because that's the label the formatting and charting code looks up.
year_totals = pd.DataFrame({0: years})
# Sort results by year
year_totals.sort_index(inplace=True)
# Display the results
year_totals.style.format({0: '{:,}'})
0 | |
---|---|
1800 | 1 |
1898 | 2 |
1899 | 2 |
1900 | 3 |
1901 | 4 |
1902 | 34 |
1903 | 20 |
1904 | 12 |
1905 | 13 |
1906 | 16 |
1907 | 17 |
1908 | 19 |
1909 | 33 |
1910 | 47 |
1911 | 50 |
1912 | 113 |
1913 | 106 |
1914 | 96 |
1915 | 130 |
1916 | 91 |
1917 | 86 |
1918 | 88 |
1919 | 113 |
1920 | 136 |
1921 | 148 |
1922 | 777 |
1923 | 920 |
1924 | 1,044 |
1925 | 1,154 |
1926 | 1,498 |
1927 | 1,431 |
1928 | 1,328 |
1929 | 1,213 |
1930 | 1,128 |
1931 | 998 |
1932 | 848 |
1933 | 939 |
1934 | 949 |
1935 | 973 |
1936 | 1,010 |
1937 | 1,076 |
1938 | 155 |
1939 | 981 |
1940 | 1,572 |
1941 | 274 |
1942 | 211 |
1943 | 162 |
1944 | 174 |
1945 | 175 |
1946 | 251 |
1947 | 117 |
1948 | 121 |
1949 | 134 |
1950 | 146 |
1951 | 157 |
1952 | 166 |
1953 | 180 |
1954 | 194 |
1955 | 200 |
1956 | 221 |
1957 | 238 |
1958 | 251 |
1959 | 266 |
1960 | 270 |
1961 | 273 |
1962 | 278 |
1963 | 280 |
1964 | 280 |
1965 | 285 |
1966 | 286 |
1967 | 289 |
1968 | 288 |
1969 | 292 |
1970 | 296 |
1971 | 296 |
1972 | 280 |
1973 | 275 |
1974 | 255 |
1975 | 244 |
1976 | 225 |
1977 | 205 |
1978 | 187 |
1979 | 172 |
1980 | 171 |
1981 | 151 |
1982 | 134 |
1983 | 122 |
1984 | 116 |
1985 | 92 |
1986 | 76 |
1987 | 62 |
1988 | 53 |
1989 | 41 |
1990 | 33 |
1991 | 23 |
1992 | 20 |
1993 | 22 |
1994 | 19 |
1995 | 24 |
1996 | 14 |
1997 | 10 |
1998 | 7 |
1999 | 6 |
2000 | 4 |
2001 | 3 |
2002 | 2 |
2003 | 1 |
2004 | 1 |
2005 | 1 |
# Let's graph the frequency of content years
# One bar per year: the years are the dataframe's index, the totals are in column 0
year_bars = go.Bar(x=year_totals.index.values, y=year_totals[0])
plotly_data = [year_bars]
# Chart title and axis labels
layout = go.Layout(
    title='Content dates',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Number of items'},
)
# Combine the bars and the labels into a figure, then draw it
fig = go.Figure(data=plotly_data, layout=layout)
py.iplot(fig, filename='series-dates-bar')
Note that a slightly enhanced version of the code above is available in the series_details module that you can import into any notebook. So to plot the content dates of a series you can just:
# Import the module
import series_details
# Call plot_dates(), providing the dataframe
series_details.plot_dates(df)
# Find titles containing a particular phrase -- in this case 'wife'
# This creates a new dataframe called 'df_filtered'
# Try changing this to filter for other words
# (by default pandas treats the search term as a regular expression)
search_term = 'wife'
# na=False counts items without a title as non-matches, so a missing title can't break the filter
title_matches = df['title'].str.contains(search_term, case=False, na=False)
df_filtered = df.loc[title_matches].copy()
df_filtered
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-e0911c59875d> in <module>() 4 5 search_term = 'wife' ----> 6 df_filtered = df.loc[df['title'].str.contains(search_term, case=False)].copy() 7 df_filtered NameError: name 'df' is not defined
# The filtered dataframe can be charted in just the same way as the whole series
series_details.plot_dates(df_filtered)
# Write the filtered dataframe to a CSV file named after the series and the search term
filtered_csv_name = '{}-{}.csv'.format(series.replace('/', '-'), search_term)
df_filtered.to_csv(os.path.join('data', filtered_csv_name))
# Find titles containing one of two words -- ie an OR statement
# Try changing this to filter for other words
# na=False counts items without a title as non-matches, so a missing title can't break the filter
has_chinese = df['title'].str.contains('chinese', case=False, na=False)
# \b is a word boundary, so this only matches 'ah' as a whole word
has_ah = df['title'].str.contains(r'\bah\b', case=False, na=False)
df_filtered = df.loc[has_chinese | has_ah].copy()
df_filtered
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
494 | 406808 | B13 | 1912/11171 | Application for Exemption Certificate, Ah Fan | 1912 - 1912 | 1912-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
496 | 406815 | B13 | 1912/6389 | Application for Exemption Certificate, Charlie... | 1912 - 1912 | 1912-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
499 | 406824 | B13 | 1912/10341 | Application for Exemption Certificate, Ah Yow | 1912 - 1912 | 1912-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
501 | 406830 | B13 | 1912/4220 | Application for Exemption Certificate, Ah Wah | 1912 - 1912 | 1912-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
506 | 406850 | B13 | 1912/450 | Application for Exemption Certificate, Ah Get | 1911 - 1912 | 1911-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
508 | 406857 | B13 | 1911/21298 | Application for Certificates of Exemption, Ah ... | 1911 - 1913 | 1911-01-01 | 1913-01-01 | Open | Melbourne | False | 0 |
509 | 406860 | B13 | 1911/21305 | Application for Certificate of Exemption, Ah C... | 1911 - 1913 | 1911-01-01 | 1913-01-01 | Open | Melbourne | False | 0 |
511 | 406868 | B13 | 1912/433 | Cutting from "Sunday Times" W.A. re Lee Keong ... | 1912 - 1912 | 1912-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
513 | 406880 | B13 | 1911/13173 | Application for Exemption Certificate, Ah Goune | 1911 - 1913 | 1911-01-01 | 1913-01-01 | Open | Melbourne | False | 0 |
514 | 406883 | B13 | 1911/14554 | Prohibited Immigrant, Herbert Ah Loy | 1911 - 1911 | 1911-01-01 | 1911-01-01 | Open | Melbourne | False | 0 |
516 | 406893 | B13 | 1910/6136 | Application for Exemption Certificate, George ... | 1910 - 1912 | 1910-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
519 | 406910 | B13 | 1911/4400 | Application for Exemption Certificate, Ah Cheong | 1911 - 1912 | 1911-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
520 | 406915 | B13 | 1911/11444 | Application for Exemption Certificate, Ah Hee | 1911 - 1914 | 1911-01-01 | 1914-01-01 | Open | Melbourne | False | 0 |
527 | 406943 | B13 | 1903/7130 | Admittance of Chinese to Commonwealth on prese... | 1903 - 1903 | 1903-01-01 | 1903-01-01 | Open | Melbourne | False | 0 |
528 | 406947 | B13 | 1903/9491 | (Local) Chinese, being permitted to go onboard... | 1903 - 1903 | 1903-01-01 | 1903-01-01 | Open | Melbourne | False | 0 |
530 | 407082 | B13 | 1902/4295 | Query- is wife of Chinese who has been a resid... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
534 | 407104 | B13 | 1902/547 | Chinese coming from another State. Will they b... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
535 | 407121 | B13 | 1902/713 | Immigration Restriction Act to be applied in p... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
536 | 407127 | B13 | 1902/1166 | Care to be exercised in admission of Chinese o... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
537 | 407133 | B13 | 1902/2793 | Opinion of Attorney General as to whether a Ch... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
540 | 407154 | B13 | 1902/464 | Immigration Restriction Act. Re Chinese passin... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
541 | 407161 | B13 | 1902/487 | Re Chinese (4) on board the "Clitus". Are 2 Sp... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
542 | 407168 | B13 | 1902/3720 | Re Certificate of Domicile for Ah Poy. Asks if... | 1902 - 1902 | 1902-01-01 | 1902-01-01 | Open | Melbourne | False | 0 |
544 | 407185 | B13 | 1919/5628 | Quan Ah Sam - Refused Certificate for Exemptio... | 1919 - 1919 | 1919-01-01 | 1919-01-01 | Open | Melbourne | False | 0 |
548 | 407212 | B13 | 1905/5371 | Appeal of certain Chinese against conviction a... | 1905 - 1905 | 1905-01-01 | 1905-01-01 | Open | Melbourne | False | 0 |
549 | 407220 | B13 | 1908/14693 | Attempts made to effect substitution of other ... | 1908 - 1908 | 1908-01-01 | 1908-01-01 | Open | Melbourne | False | 0 |
557 | 407277 | B13 | 1909/11714 | Illicit entry of Chinese presenting Naturaliza... | 1909 - 1909 | 1909-01-01 | 1909-01-01 | Open | Melbourne | False | 0 |
558 | 407283 | B13 | 1909/16634 | Ah Bing applies for Certificate under Section ... | 1909 - 1909 | 1909-01-01 | 1909-01-01 | Open | Melbourne | False | 0 |
562 | 407303 | B13 | 1909/3855 | Family & Staff of Mr. Liang Lan-Hsun Chinese C... | 1909 - 1909 | 1909-01-01 | 1909-01-01 | Open | Melbourne | False | 0 |
567 | 407326 | B13 | 1908/8431 | Ah Woo appln for Cert. | 1906 - 1906 | 1906-01-01 | 1906-01-01 | Open | Melbourne | False | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
19827 | 5945296 | B13 | 1923/24954 | Ah Yuck - applies for a Certificate of Exempti... | 1923 - 1923 | 1923-01-01 | 1923-01-01 | Open | Melbourne | False | 0 |
19829 | 5945298 | B13 | 1924/2065 | Chinese passengers per SS ARAFURA permitted to... | 1924 - 1924 | 1924-01-01 | 1924-01-01 | Open | Melbourne | False | 0 |
19836 | 5945305 | B13 | 1924/14752 | Ah Quon, Application for Certificate of Exempt... | 1924 - 1924 | 1924-01-01 | 1924-01-01 | Open | Melbourne | False | 0 |
19842 | 5945400 | B13 | 1926/8111 | George Lum, Ah Fang, Ah Hing - Victorian Certi... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
19843 | 5945401 | B13 | 1926/8940 | Ah Dow Certificate of Exemption from Dictation... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
19856 | 5945414 | B13 | 1926/25647 | Chinese passengers arriving at Melbourne on 16... | 1926 - 1927 | 1926-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
19863 | 5945421 | B13 | 1926/11270 | Ah Din application for Certificate of Exemptio... | 1925 - 1930 | 1925-01-01 | 1930-01-01 | Open | Melbourne | False | 0 |
19913 | 5953257 | B13 | 1932/8267 | Exemption from Dictation Test of Chinese perso... | 1932 - 1932 | 1932-01-01 | 1932-01-01 | Open | Melbourne | False | 0 |
19917 | 5953261 | B13 | 1932/6671 | Kee Sik Kwai and Chow Mow Pun - Chinese ex SS ... | 1932 - 1932 | 1932-01-01 | 1932-01-01 | Open | Melbourne | False | 0 |
19954 | 6045355 | B13 | 1936/11020 | Chin Wat, Lam Kee, Louey Doon, Chinese passeng... | 1936 - 1936 | 1936-01-01 | 1936-01-01 | Open | Melbourne | False | 0 |
19986 | 6551572 | B13 | 1925/27508 | Ah Kong Application for a Certificate of Exemp... | 1925 - 1927 | 1925-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
19990 | 6551576 | B13 | 1925/14137 | Ah Goon and Mah Wah arrival in Melbourne per S... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
19995 | 6551581 | B13 | 1925/24262 | Rejected application by Mr Ah On from the firm... | 1925 - 1926 | 1925-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20043 | 6553344 | B13 | 1933/14364 | Chia Tak Eng, Chinese member of crew of s.s. "... | 1933 - 1933 | 1933-01-01 | 1933-01-01 | Open | Melbourne | False | 0 |
20061 | 6553362 | B13 | 1937/16107 | Ah Wing - application for C.E.D.T. (Certificat... | 1926 - 1940 | 1926-01-01 | 1940-01-01 | Open | Melbourne | False | 0 |
20095 | 10311991 | B13 | 1933/24224 | Chin Ah Leong aka Willie C Long applies for Ce... | 1927 - 1933 | 1927-01-01 | 1933-01-01 | Open | Melbourne | False | 0 |
20115 | 10535295 | B13 | 1933/23067 | Ah Wee [Ah Way] - Apllication for CEDT | 1898 - 1946 | 1898-01-01 | 1946-01-01 | Open | Melbourne | False | 0 |
20117 | 10538200 | B13 | 1933/24225 | Ah Joe applies for Certificate Exempting from ... | 1926 - 1933 | 1926-01-01 | 1933-01-01 | Open | Melbourne | False | 0 |
20123 | 10559103 | B13 | 1911/2239 | Ah Jack - Application for Certificate of Exemp... | 1910 - 1916 | 1910-01-01 | 1916-01-01 | Open | Melbourne | False | 0 |
20127 | 11979730 | B13 | 1912/12664 | Poon Ah Soo - Certificate of Exemption from Di... | 1912 - 1912 | 1912-01-01 | 1912-01-01 | Open | Melbourne | False | 0 |
20130 | 11993962 | B13 | 1932/5459 | Arrival of Chinese passengers ex S.S. Taiping | 1932 - 1932 | 1932-01-01 | 1932-01-01 | Open | Melbourne | False | 0 |
20152 | 30762921 | B13 | 1916/6339 | Ah Lock, Application for C.E.D.T [includes pho... | 1916 - 1916 | 1916-01-01 | 1916-01-01 | Open | Melbourne | False | 0 |
20158 | 30762927 | B13 | 1916/7813 | Ah Chow, Application for C.E.D.T [includes pho... | 1916 - 1916 | 1916-01-01 | 1916-01-01 | Open | Melbourne | False | 0 |
20159 | 30762929 | B13 | 1922/9340 | Collector of Customs - Melbourne - Five Chines... | 1922 - 1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | False | 0 |
20160 | 30762930 | B13 | 1922/9690 | C.E.D.T Book 259 Number 72 relating to Chinese... | 1922 - 1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | False | 0 |
20161 | 30762931 | B13 | 1913/484 | Ah Louey; Application for C.E.D.T [includes p... | 1913 - 1914 | 1913-01-01 | 1914-01-01 | Open | Melbourne | False | 0 |
20166 | 30762942 | B13 | 1924/8009 | Ah hing and Tang Cheong; Deserters from the S.... | 1924 - 1924 | 1924-01-01 | 1924-01-01 | Open | Melbourne | False | 0 |
20179 | 30762957 | B13 | 1926/24036 | Ah Hing - C.E.D.T in favour, leaving port Melb... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20182 | 30762960 | B13 | 1926/26816 | Ah Jick - Application for C.E.D.T | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20185 | 30762974 | B13 | 1929/22619 | Ah Jim; Application for C.E.D.T | 1929 - 1939 | 1929-01-01 | 1939-01-01 | Open | Melbourne | False | 0 |
2457 rows × 11 columns
# Find items whose content dates fall entirely between two years (inclusive)
# Try changing these to look at a different period
start_year = '1920'
end_year = '1930'
# Compare years rather than full dates. Comparing an end date with the string '1930' really
# compares it with 1 January 1930, which would leave out items ending later in that year.
starts_in_range = df['start_date'].dt.year >= int(start_year)
ends_in_range = df['end_date'].dt.year <= int(end_year)
# Take a copy, as the other filters do, so changes to the result don't touch the full dataframe
df_filtered = df.loc[starts_in_range & ends_in_range].copy()
df_filtered
identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 787258 | B13 | 1924/7516 | Charlie Lam Sun (Charlie Shack Mayberry) - Arr... | 1924 - circa1924 | 1924-01-01 | 1924-01-01 | Open | Melbourne | False | 0 |
1 | 790335 | B13 | 1926/6755 | Edward Traynor - permission to enter Australia... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
397 | 405608 | B13 | 1930/7915 | Prospective Italian Migrants: Family of Pitron... | 1930 - 1930 | 1930-01-01 | 1930-01-01 | Open | Melbourne | False | 0 |
398 | 405614 | B13 | 1930/14816 | Alien Migration to Australia: Landing Money re... | 1928 - 1930 | 1928-01-01 | 1930-01-01 | Open | Melbourne | False | 0 |
399 | 405618 | B13 | 1930/18541 | Passengers for New Zealand aboard R.M.S. "Orvi... | 1930 - 1930 | 1930-01-01 | 1930-01-01 | Open | Melbourne | False | 0 |
470 | 406689 | B13 | 1930/16951 | Immigration Act - 1901-1925 - Deportation for ... | 1927 - 1930 | 1927-01-01 | 1930-01-01 | Open | Melbourne | False | 0 |
619 | 407878 | B13 | 1926/13464 | Mowsey Inagaki, ex S.S. "Tango Maru", | 1924 - 1926 | 1924-01-01 | 1926-01-01 | Open | Melbourne | True | 4 |
620 | 407909 | B13 | 1929/16370 | Lazare Morel, Mauritius deserter ex "King John... | 1929 - 1929 | 1929-01-01 | 1929-01-01 | Open | Melbourne | False | 0 |
639 | 408060 | B13 | 1930/510 | Request by Customs, N.S.W., for verification a... | 1930 - 1930 | 1930-01-01 | 1930-01-01 | Open | Melbourne | False | 0 |
640 | 408067 | B13 | 1930/9190 | Alfredo Debono - Contract immigrant ex S.S. "B... | 1930 - 1930 | 1930-01-01 | 1930-01-01 | Open | Melbourne | False | 0 |
641 | 408074 | B13 | 1929/12987 | Emmanuel Vassalo - restricted crew member S.S.... | 1929 - 1929 | 1929-01-01 | 1929-01-01 | Open | Melbourne | False | 0 |
642 | 408080 | B13 | 1927/18773 | Application from Joseph Gauci to bring nephew,... | 1927 - 1927 | 1927-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
643 | 408083 | B13 | 1927/25816 | Re Francis Grech, Maltese | 1927 - 1927 | 1927-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
644 | 408086 | B13 | 1927/9675 | Michael Caruana - re endorsement/renewal of pa... | 1922 - 1927 | 1922-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
645 | 408092 | B13 | 1927/18178 | Admission of Maltese into Commonwealth | 1920 - 1927 | 1920-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
646 | 408106 | B13 | 1927/341 | Salvatore Camillori: departure per S.S. "Regin... | 1926 - 1927 | 1926-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
647 | 408111 | B13 | 1927/2190 | Carmelo Meilak, Maltese - Deported per R.M.S. ... | 1927 - 1927 | 1927-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
648 | 408115 | B13 | 1925/7385 | Arrival of 34 Maltese on "Ville De Verdun", "R... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
649 | 408125 | B13 | 1926/10370 | Maltese passengers ex S.S. "Ville de Verdun" | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
650 | 408159 | B13 | 1923/6818 | Joseph Farregan, Maltese seaman ex S.S. "Gilgai" | 1923 - 1924 | 1923-01-01 | 1924-01-01 | Open | Melbourne | False | 0 |
651 | 408170 | B13 | 1925/4224 | Application by Joseph Cassar - to bring brothe... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
652 | 408178 | B13 | 1929/7535 | Arrival S.S. "Citta di Genova", Apr. 1929: Thr... | 1929 - 1929 | 1929-01-01 | 1929-01-01 | Open | Melbourne | False | 0 |
653 | 408195 | B13 | 1926/3599 | Atto di Chiamata Forms - Italian passengers ex... | 1925 - 1926 | 1925-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
654 | 408223 | B13 | 1926/21135 | Atto Di Chiamata Forms for Italian passengers ... | 1925 - 1926 | 1925-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
655 | 408232 | B13 | 1928/1846 | Atto Di Chiamata Forms - Italian passengers ex... | 1926 - 1927 | 1926-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
656 | 408240 | B13 | 1926/794 | Kichiji Owa: Certificate of exemption - visit ... | 1925 - 1926 | 1925-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
657 | 408245 | B13 | 1926/4652 | Passenger on S.S."Regina D'Italia": Michele La... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
658 | 408252 | B13 | 1926/16579 | Sponsorship of Italian Migrant: Mrs Filomena G... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
659 | 408258 | B13 | 1926/16580 | Sponsorship of English Migrant: Mrs. Neilson | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
660 | 408266 | B13 | 1927/21931 | Prospective deportation of Italian Migrant: Gi... | 1927 - 1927 | 1927-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
20148 | 30762917 | B13 | 1925/6271 | Leslie Macdonald - arrival per Beltana 17 Augu... | 1925 - 1929 | 1925-01-01 | 1929-01-01 | Open | Melbourne | False | 0 |
20159 | 30762929 | B13 | 1922/9340 | Collector of Customs - Melbourne - Five Chines... | 1922 - 1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | False | 0 |
20160 | 30762930 | B13 | 1922/9690 | C.E.D.T Book 259 Number 72 relating to Chinese... | 1922 - 1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | False | 0 |
20162 | 30762936 | B13 | 1923/782 | William Dalton ex "Largs Bay"; Application for... | 1923 - 1923 | 1923-01-01 | 1923-01-01 | Open | Melbourne | False | 0 |
20163 | 30762937 | B13 | 1923/5170 | Report by the boarding inspector regarding E. ... | 1923 - 1923 | 1923-01-01 | 1923-01-01 | Open | Melbourne | False | 0 |
20164 | 30762939 | B13 | 1923/18291 | Mrs. Mary Ann Hamilton arriving Melbourne per ... | 1923 - 1924 | 1923-01-01 | 1924-01-01 | Open | Melbourne | False | 0 |
20165 | 30762940 | B13 | 1923/18459 | Restricted Passengers S.S. Ulysses; Miss Curry... | 1923 - 1923 | 1923-01-01 | 1923-01-01 | Open | Melbourne | False | 0 |
20166 | 30762942 | B13 | 1924/8009 | Ah hing and Tang Cheong; Deserters from the S.... | 1924 - 1924 | 1924-01-01 | 1924-01-01 | Open | Melbourne | False | 0 |
20167 | 30762943 | B13 | 1925/9274 | Letter of Admission for Miss Isabella Scott, T... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
20168 | 30762944 | B13 | 1925/18878 | Mrs. Sarah Hayhurst; permission to disembark a... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
20169 | 30762945 | B13 | 1925/19725 | William Dearing, Permission to disembark in Me... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
20171 | 30762948 | B13 | 1925/28776 | Passengers of the S.S. Gascoyne; Luada, Bux, K... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
20172 | 30762949 | B13 | 1926/5387 | Ruth Ellen ROgers, Auckland, New Zealand - Req... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20173 | 30762950 | B13 | 1926/5504 | Application for return of Passport/Permit - Go... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20174 | 30762952 | B13 | 1926/8214 | Atto di Chiamata forms for Italians ex "Orama"... | 1925 - 1926 | 1925-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20175 | 30762953 | B13 | 1926/10418 | Departure of A.Baldiserra aboard S.S. Orsova o... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20176 | 30762954 | B13 | 1926/12411 | William Mark Snow, Prohibited passanger aboard... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20177 | 30762955 | B13 | 1926/12533 | Thomas D.L. Canning; ex S.S. 'Esperance Bay' f... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20178 | 30762956 | B13 | 1926/18497 | Alexander Martin - Ex SS Balranald, boarded SS... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20179 | 30762957 | B13 | 1926/24036 | Ah Hing - C.E.D.T in favour, leaving port Melb... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20180 | 30762958 | B13 | 1926/25071 | Crispino Bedont - Leaving Commonwealth aboard ... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20182 | 30762960 | B13 | 1926/26816 | Ah Jick - Application for C.E.D.T | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20183 | 30762961 | B13 | 1926/29708 | Wong Kie; Application for C.E.D.T [Includes tw... | 1926 - 1926 | 1926-01-01 | 1926-01-01 | Open | Melbourne | False | 0 |
20184 | 30762962 | B13 | 1927/18772 | T. Sugimoto ex S.S. "Aki Maru" | 1927 - 1927 | 1927-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
20187 | 30783406 | B13 | 1927/26439 | Restricted persons - RMS Mooltan | 1927 - 1927 | 1927-01-01 | 1927-01-01 | Open | Melbourne | False | 0 |
20188 | 30783541 | B13 | 1925/1448 | Mrs Antonia Joseph - arrival and departure per... | 1925 - 1925 | 1925-01-01 | 1925-01-01 | Open | Melbourne | False | 0 |
20189 | 60184762 | B13 | 1922/7845 | Fazal Deen - application for extension of Cert... | 1922 - circa1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | True | 1 |
20190 | 60184763 | B13 | 1922/7845 | Fazal Deen - application for extension of Cert... | 1922 - circa1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | True | 1 |
20191 | 60184764 | B13 | 1922/7845 | Fazal Deen - application for extension of Cert... | 1922 - circa1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | False | 0 |
20192 | 60184765 | B13 | 1922/7845 | Fazal Deen - application for extension of Cert... | 1922 - circa1922 | 1922-01-01 | 1922-01-01 | Open | Melbourne | True | 1 |
9570 rows × 11 columns
# Import TextBlob for text analysis
from textblob import TextBlob
import nltk
# NLTK's list of English stop words -- these are left out of the word counts
stopwords = nltk.corpus.stopwords.words('english')
# Combine all of the file titles into a single lowercase string
title_text = df['title'].str.lower().str.cat(sep=' ')
blob = TextBlob(title_text)
# Checking membership of a set is much quicker than searching a list for every word
stopword_set = set(stopwords)
# The text is already lowercase, so the counts can be compared with the stop words directly
words = [[word, count] for word, count in blob.word_counts.items() if word not in stopword_set]
# Name the columns when the dataframe is created (this also works when there are no words at all),
# then put the most frequent words first
word_counts = pd.DataFrame(words, columns=['word', 'count']).sort_values(by='count', ascending=False)
# Show the top 25 words as a table, with a bar in each cell of the 'count' column
word_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'})
word | count | |
---|---|---|
7 | per | 5,172 |
314 | ex | 4,363 |
902 | exemption | 3,688 |
618 | certificate | 3,577 |
1655 | dictation | 3,577 |
1581 | test | 3,553 |
71 | melbourne | 3,168 |
538 | application | 2,442 |
174 | departure | 2,006 |
14 | australia | 1,977 |
1543 | ah | 1,796 |
949 | passengers | 1,620 |
173 | arrival | 1,560 |
26 | ltd | 1,446 |
104 | act | 1,180 |
1482 | mrs | 1,175 |
6 | sydney | 1,075 |
1861 | s.s | 1,074 |
12 | permission | 1,050 |
830 | crew | 1,015 |
25 | pty | 950 |
1621 | applied | 927 |
1583 | chinese | 862 |
2049 | enemy | 858 |
24 | co | 835 |
def get_ngram_counts(text, size):
    """Count how often each n-gram (a run of `size` consecutive words) occurs in `text`.

    The text is lowercased before the n-grams are extracted, so counting
    is case-insensitive.

    Parameters:
        text: the string to analyse.
        size: the number of words in each n-gram.

    Returns:
        A dataframe with an 'ngram' column (the words joined by single spaces)
        and a 'count' column, with the most frequent n-grams first. If the text
        yields no n-grams the dataframe is empty.
    """
    blob = TextBlob(text)
    # Extract n-grams as WordLists, then convert to a list of strings
    ngrams = [' '.join(ngram) for ngram in blob.lower().ngrams(size)]
    # Count the values in a Series, then name the index and the counts as columns.
    # (Going via a DataFrame and selecting column 0 fails when the list is empty.)
    ngram_counts = pd.Series(ngrams, dtype='object').value_counts().rename_axis('ngram').reset_index(name='count')
    return ngram_counts
def display_top_ngrams(text, size):
    """Display the 25 most frequent n-grams in `text` as a table with bars.

    Parameters:
        text: the string to analyse.
        size: the number of words in each n-gram (2 for pairs of words, and so on).
    """
    # Pass the requested size through, so that sizes other than 2 are honoured
    ngram_counts = get_ngram_counts(text, size)
    # Display top 25 results as a bar chart
    display(ngram_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))
# Show the most frequent n-grams in the titles, asking for a size of 2 (pairs of words)
display_top_ngrams(title_text, 2)
ngram | count | |
---|---|---|
0 | exemption from | 3,550 |
1 | from dictation | 3,540 |
2 | dictation test | 3,533 |
3 | for exemption | 3,004 |
4 | certificate for | 2,872 |
5 | for certificate | 2,660 |
6 | application for | 2,236 |
7 | melbourne per | 1,054 |
8 | departure per | 1,009 |
9 | pty ltd | 927 |
10 | applied for | 923 |
11 | to australia | 859 |
12 | trading with | 782 |
13 | enemy act | 765 |
14 | with enemy | 764 |
15 | test ah | 735 |
16 | permission to | 698 |
17 | act 1939 | 691 |
18 | of exemption | 636 |
19 | certificate of | 625 |
20 | crew member | 623 |
21 | arrival per | 458 |
22 | to enter | 437 |
23 | of certificate | 419 |
24 | passengers melbourne | 390 |
# Show the most frequent n-grams in the titles again, this time asking for a size of 6
display_top_ngrams(title_text, 6)
ngram | count | |
---|---|---|
0 | exemption from | 3,550 |
1 | from dictation | 3,540 |
2 | dictation test | 3,533 |
3 | for exemption | 3,004 |
4 | certificate for | 2,872 |
5 | for certificate | 2,660 |
6 | application for | 2,236 |
7 | melbourne per | 1,054 |
8 | departure per | 1,009 |
9 | pty ltd | 927 |
10 | applied for | 923 |
11 | to australia | 859 |
12 | trading with | 782 |
13 | enemy act | 765 |
14 | with enemy | 764 |
15 | test ah | 735 |
16 | permission to | 698 |
17 | act 1939 | 691 |
18 | of exemption | 636 |
19 | certificate of | 625 |
20 | crew member | 623 |
21 | arrival per | 458 |
22 | to enter | 437 |
23 | of certificate | 419 |
24 | passengers melbourne | 390 |