# Identifier of the series summarised by this notebook
series = 'SP42/1'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Initialise plotly so charts can be rendered inside the notebook
py.init_notebook_mode()

# The item data is read from data/<series>.csv, where any '/' in the
# series identifier is replaced by '-' to form the file name
csv_path = os.path.join('data', series.replace('/', '-') + '.csv')
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

# Display the summary table for the series
series_details.display_summary(series, df)
Total items | 16,256 |
---|---|
Access status | |
Open | 15,525 (95.50%) |
Not yet examined | 731 (4.50%) |
Number of items digitised | 3,253 (20.01%) |
Number of pages digitised | 45,862 |
Date of earliest content | 1881 |
Date of latest content | 1960 |
# How many rows of the dataframe to preview -- raise this value to see more
number_of_rows = 5

# Table-level CSS: centre the header cells, and hide the index column
# (its row headings and the blank corner cell)
centred_headers = {"selector": "th", "props": [("text-align", "center")]}
hidden_index = {"selector": '.row_heading, .blank', "props": [('display', 'none')]}

# Left-align the (long) title column, then attach the table styles.
# The styled preview is the last expression so the notebook renders it.
preview = df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'})
preview.set_table_styles([centred_headers, hidden_index])
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1053878 | SP42/1 | B1906/694 | AH KIM [correspondence of the Collector of Customs relating to immigration restrictions] [6 pages] [box 15] | 1906 - 1906 | 1906-01-01 00:00:00 | 1906-01-01 00:00:00 | Open | Sydney | True | 6 |
1 | 1563661 | SP42/1 | B1905/1553 | Ah Kong, includes photographs | 1905 - circa1905 | 1905-01-01 00:00:00 | NaT | Open | Sydney | True | 10 |
2 | 1563665 | SP42/1 | B1905/1557 | Ah Yet, includes photographs | 1905 - 1905 | 1905-01-01 00:00:00 | 1905-01-01 00:00:00 | Open | Sydney | True | 12 |
3 | 1563670 | SP42/1 | B1905/1561 | You Gee | 1905 - 1905 | 1905-01-01 00:00:00 | 1905-01-01 00:00:00 | Open | Sydney | True | 8 |
4 | 1563675 | SP42/1 | B1905/1565 | Deserters from the RMS INDIA; Meer Afzul and Ackbar Carrandad | 1905 - 1905 | 1905-01-01 00:00:00 | 1905-01-01 00:00:00 | Open | Sydney | True | 17 |
# Build a figure from the dataframe with series_details.plot_dates and render
# it inline using plotly's offline iplot.
# NOTE(review): presumably a bar chart of the items' content dates (going by
# the helper name and the 'series-dates-bar' filename) -- confirm in series_details.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single lowercase string, with titles
# separated by a space. Missing titles are omitted by str.cat's default
# behaviour (na_rep=None), so they do not contribute stray text.
title_text = df['title'].str.lower().str.cat(sep=' ')
# Tabulate word frequencies across the combined titles
series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
10 | box | 14,077 |
13 | includes | 9,891 |
43 | left | 9,844 |
77 | prints | 7,827 |
32 | showing | 7,181 |
33 | front | 7,144 |
203 | ex | 6,808 |
34 | side | 6,695 |
35 | views | 6,590 |
14 | photographs | 6,521 |
1055 | thumb | 6,171 |
235 | right | 6,149 |
158 | sydney | 4,292 |
1047 | subject | 3,745 |
0 | ah | 3,585 |
39 | also | 3,409 |
40 | known | 3,318 |
1052 | arrived | 2,935 |
1063 | issue | 2,843 |
1064 | favour | 2,795 |
63 | certificate | 2,737 |
44 | hand | 2,635 |
31 | 2 | 2,503 |
240 | exemption | 2,371 |
1061 | finger | 1,978 |
# Number of consecutive words in each ngram: 2 counts word pairs (bigrams).
# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2
# Display the ngrams of that size found in the combined title text, with counts
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | showing front | 7,109 |
1 | and side | 6,660 |
2 | front and | 6,637 |
3 | side views | 6,554 |
4 | and left | 6,514 |
5 | and right | 6,139 |
6 | left and | 6,082 |
7 | photographs showing | 6,072 |
8 | thumb prints | 6,049 |
9 | right thumb | 6,005 |
10 | known as | 3,296 |
11 | also known | 3,293 |
12 | views and | 3,244 |
13 | sydney on | 3,098 |
14 | prints box | 2,994 |
15 | of subject | 2,898 |
16 | issue of | 2,836 |
17 | in favour | 2,790 |
18 | favour of | 2,778 |
19 | in sydney | 2,741 |
20 | arrived ex | 2,694 |
21 | subject box | 2,596 |
22 | left hand | 2,585 |
23 | prints and | 2,262 |
24 | of exemption | 2,052 |