This notebook compares predictions about the dates of individual pages with images of the actual pages. Put simply, it helps you quickly work out where your predictions are going wrong.
import pandas as pd
import arrow
from IPython.display import display, HTML
# Volume list; the final row of the CSV is dropped.
series = pd.read_csv('series_list.csv').iloc[:-1]
# Holiday table; get_holidays() reads its 'year' and 'date' columns.
all_holidays = pd.read_csv('nsw_holidays_1900_1950.csv')
# Dates are left as read from the CSV -- the datetime conversion is disabled:
#all_holidays.loc[:, 'date'] = pd.to_datetime(all_holidays.loc[:, 'date'], errors='coerce')
def get_holidays(year):
    """Return the 'date' values of every all_holidays row whose 'year' equals *year*, as a list."""
    in_year = all_holidays['year'] == year
    return all_holidays.loc[in_year, 'date'].to_list()
# Page counts added per day by test_volume(), keyed by an inclusive
# '<first volume>_<last volume>' range (parsed by get_pages()).
pages_per_vol = {
    '1_134': {'weekday': 5, 'saturday': 2},
    '135_145': {'weekday': 6, 'saturday': 2},
    '146_164': {'weekday': 9, 'saturday': 3},
    '165_190': {'weekday': 6, 'saturday': 3},
    '191_199': {'weekday': 8, 'saturday': 0},
}
def get_pages(vol_num):
    """Look up the page counts that apply to volume *vol_num*.

    Walks pages_per_vol in order and returns the first entry whose key covers
    the volume number. Returns None when no key matches.
    """
    for span, counts in pages_per_vol.items():
        numbers = [int(piece) for piece in span.split('_')]
        # A two-number key is an inclusive first/last range; any other key is
        # treated as an explicit list of volume numbers.
        candidates = range(numbers[0], numbers[1] + 1) if len(numbers) == 2 else numbers
        if vol_num in candidates:
            return counts
    return None
def predict_pages(start_date, end_date, weekday_pages=5, saturday_pages=2, include_saturday=True):
    """Estimate how many pages fall between two dates.

    Args:
        start_date: Arrow date of the first day to count (inclusive).
        end_date: Arrow date at which counting stops (exclusive, as in daterange()).
        weekday_pages: Pages added for each Monday-Friday that is not a holiday.
        saturday_pages: Pages added for each Saturday that is not a holiday.
        include_saturday: When false, Saturdays contribute no pages.

    Returns:
        The total number of pages. Sundays and holidays add nothing.

    Only the holidays of ``start_date.year`` are consulted.
    """
    # The holiday table holds 'YYYY-MM-DD' strings, so each date is formatted
    # before the membership test; comparing the Arrow object itself to those
    # strings never matches, which meant holidays were never excluded.
    holidays = set(get_holidays(start_date.year))
    pages = 0
    for single_date in daterange(start_date, end_date):
        weekday = single_date.weekday()
        if weekday == 6 or single_date.format('YYYY-MM-DD') in holidays:
            continue
        if weekday == 5:
            if include_saturday:
                pages += saturday_pages
        else:
            pages += weekday_pages
    return pages
def daterange(start_date, end_date):
    """Yield each day from *start_date* up to, but not including, *end_date*.

    Days are produced by calling ``start_date.shift(days=...)``; nothing is
    yielded when *end_date* is not after *start_date*.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date.shift(days=offset)
        offset += 1
def test_volume(volume, weekday_pages=5, saturday_pages=2, include_saturday=True):
    """Step through a volume day by day, showing the predicted page for each next date.

    For every day in the volume that is not a Sunday, a holiday, or a date
    recorded in ``missing`` with 0 pages, the running page number is advanced
    and the header image predicted to start the next such day is displayed,
    so a mismatch between the expected date and the image can be spotted.

    Args:
        volume: Item number suffix such as 'N193-146'; the part after the
            hyphen is the volume number.
        weekday_pages: Unused -- page counts come from get_pages(). Kept so
            existing calls keep working.
        saturday_pages: Unused, see ``weekday_pages``.
        include_saturday: When false, Saturdays add no pages.

    Raises:
        IndexError: If no row of ``series`` has an Item_number ending with *volume*.

    Only the holidays of the volume's starting year are consulted.
    """
    iso = 'YYYY-MM-DD'
    vol_num = int(volume.split('-')[1])

    # Build the mask from the already-filtered frame: a mask taken from the
    # unfiltered ``series`` contains NaN for rows without an item number, and
    # pandas refuses to index with a non-boolean mask.
    items = series.dropna(subset=['Item_number'])
    matches = items.loc[items['Item_number'].str.endswith(volume, na=False)]
    if matches.empty:
        raise IndexError(f'No series entry has an Item_number ending with {volume!r}')
    volume_details = matches.iloc[0]

    start_date = arrow.get(volume_details['start_date'], iso)
    # daterange() excludes its end point, so move one day past the last date.
    end_date = arrow.get(volume_details['end_date'], iso).shift(days=+1)
    pages_vol = get_pages(vol_num)
    holidays = get_holidays(start_date.year)
    print(holidays)
    holiday_lookup = set(holidays)

    def is_skipped(day):
        """Return True when no pages are expected for *day*."""
        day_iso = day.format(iso)
        return (
            day_iso in holiday_lookup
            or day.weekday() == 6
            or missing.get(day_iso) == 0
        )

    page = 1
    for v_date in daterange(start_date, end_date):
        v_date_iso = v_date.format(iso)
        print(v_date_iso)
        if is_skipped(v_date):
            continue
        if v_date_iso in missing:
            # A recorded adjustment replaces the default count for this date.
            page += missing[v_date_iso]
        elif v_date.weekday() == 5:
            if include_saturday:
                page += pages_vol['saturday']
        else:
            page += pages_vol['weekday']
        # Find the next date on which pages are expected.
        next_date = v_date.shift(days=+1)
        while is_skipped(next_date):
            next_date = next_date.shift(days=+1)
        print(f'Expected date: {next_date.format("D MMMM YYYY")} / Page N193-{vol_num:03}_{page:04}')
        display(HTML(f'<img src="all_headers/AU NBAC N193-{vol_num:03}/N193-{vol_num:03}_{page:04}-header.jpg">'))
    # What do we do with duplicates?
The cell below contains information about adjustments that need to be made based on the testing. Once you've found a problem, you record the adjustment and run the test again.
# Per-date adjustments read by test_volume(), keyed by 'YYYY-MM-DD'.
# The value is the number of pages added for that date in place of the
# volume's default count; a value of 0 makes test_volume() skip the date
# entirely, the same way it skips Sundays and holidays.
missing = {
    '1901-01-07': 3,
    '1901-01-18': 4,
    '1901-01-23': 0, # Death of the Queen business abandoned https://trove.nla.gov.au/newspaper/article/14371864/1343690
    '1901-02-25': 4,
    '1901-03-18': 0,
    '1901-03-29': 0, # missing
    '1901-04-04': 3, # No afternoon, day before Easter
    '1901-04-09': 0, # Extra Easter Tuesday
    '1901-04-10': 0, # Extra Easter Wednesday
    '1901-05-27': 0, # Holiday Duke of Cornwall visiting
    '1901-05-28': 0, # Holiday Duke of Cornwall visiting
    '1901-07-03': 0, # Holiday for polling day
    '1901-09-16': 4, # No morning
    '1901-10-10': 4, # 1 Noon
    '1901-10-30': 4, # 1 Noon
    '1901-12-16': 2, # Noon only
    '1902-02-26': 0, # ??
    '1902-04-02': 3, # No afternoon
    '1902-06-26': 0, # ??
    '1902-08-09': 0, # ??
    '1902-10-17': 6, # 008_0063 is a duplicate
    '1903-01-06': 4, # 1 afternoon missing
    '1903-01-09': 4, # morning missing
    '1903-04-09': 3, # No afternoon, day before Easter
    '1903-04-14': 0, # Easter Tuesday
    # 1903-09-02 has no morning, but 3 noons
    '1903-09-08': 4, # no morning
    # 1903-09-16 has no morning, but 3 noons
    '1903-10-01': 3, # no afternoon
    '1903-11-18': 3, # no morning, 1 noon -- see 219 and 220!
    '1903-11-30': 7, # 2 sheets from 1903-11-18 inserted
    '1903-12-16': 0, # ??
    '1904-01-20': 3, # no afternoon
    '1904-08-15': 3, # no afternoon
    '1904-11-09': 6, # 016_145 is a duplicate
    '1905-03-02': 6, # 017_213 is a duplicate
    '1905-03-08': 6, # 017_239 is a duplicate
    '1905-04-20': 3, # No afternoon, day before Easter
    '1905-04-25': 0, # Easter Tuesday
    '1905-04-26': 0, # Easter Wednesday
    '1906-03-19': 6, # extra page, 282 is from 1906-03-21
    '1906-03-21': 4, # 1 page included in 1906-03-19
    '1906-04-02': 4, # 1 afternoon missing
    '1906-04-06': 4, # 1 afternoon missing
    '1906-04-09': 4, # 1 afternoon missing
    '1906-04-10': 4, # 1 afternoon missing
    '1906-04-11': 4, # 1 afternoon missing
    '1906-04-12': 3, # No afternoon, day before Easter
    '1906-04-17': 0, # Easter Tuesday
    '1906-04-18': 0, # Easter Wednesday
    '1906-04-25': 4, # 1 afternoon missing
    '1906-05-02': 4, # 1 afternoon missing
    '1906-05-03': 4, # 1 afternoon missing
    '1906-07-12': 4, # 1 afternoon missing
    '1906-07-16': 4, # 1 afternoon missing
    '1906-10-25': 3, # Afternoon missing
    '1907-02-02': 1, # Saturday 1 page only
    '1907-03-08': 4, # 1 afternoon missing
    '1907-04-29': 4, # 1 afternoon missing
    '1907-06-27': 2, # 2 pages only marked '11 o'clock'
    '1907-09-10': 3, # No afternoon
    '1907-10-11': 4, # 1 afternoon missing
    '1907-11-29': 4, # 1 afternoon missing
    '1907-12-02': 4, # 1 afternoon missing
    '1908-03-12': 4, # 1 afternoon missing
    '1908-04-16': 3, # No afternoon, day before Easter
    '1908-04-21': 0, # Easter Tuesday
    '1908-08-20': 0, # American Fleet visit!
    '1908-08-21': 3, # No morning?
    '1908-08-24': 0, # American Fleet visit!
    '1908-11-14': 1, # Saturday 1 page only
    '1929-03-01': 4,
    '1929-03-12': 3,
    '1929-03-27': 3,
    '1930-02-26': 3,
    '1930-04-17': 3,
    '1930-04-22': 0,
    '1930-04-23': 0,
    '1930-04-24': 0,
    '1930-04-26': 0,
    '1930-05-09': 3,
    '1930-12-23': 3
}
# Pages noted as duplicates during testing; entries look like
# '<volume>_<page>' -- TODO confirm the format, it is not read by any code here.
duplicates = ['008_0063', '016_145', '017_213', '119_265']
# Presumably volume numbers whose pages run in reverse order -- TODO confirm.
backwards = ['120']
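Ignore the final row; it's not an error. The last day of the volume still produces a prediction for the following day, which points to a page beyond the end of the volume.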
# Run the check for volume 146: prints every date in the volume and, for each
# counted day, the next expected date/page followed by that page's header image.
test_volume('N193-146')
['1937-01-01', '1937-02-01', '1937-03-26', '1937-03-27', '1937-03-29', '1937-04-26', '1937-05-12', '1937-08-02', '1937-10-04', '1937-12-25', '1937-12-27'] 1937-04-01 Expected date: 2 April 1937 / Page N193-146_0010
1937-04-02 Expected date: 3 April 1937 / Page N193-146_0019
1937-04-03 Expected date: 5 April 1937 / Page N193-146_0022
1937-04-04 1937-04-05 Expected date: 6 April 1937 / Page N193-146_0031
1937-04-06 Expected date: 7 April 1937 / Page N193-146_0040
1937-04-07 Expected date: 8 April 1937 / Page N193-146_0049
1937-04-08 Expected date: 9 April 1937 / Page N193-146_0058
1937-04-09 Expected date: 10 April 1937 / Page N193-146_0067
1937-04-10 Expected date: 12 April 1937 / Page N193-146_0070
1937-04-11 1937-04-12 Expected date: 13 April 1937 / Page N193-146_0079
1937-04-13 Expected date: 14 April 1937 / Page N193-146_0088
1937-04-14 Expected date: 15 April 1937 / Page N193-146_0097
1937-04-15 Expected date: 16 April 1937 / Page N193-146_0106
1937-04-16 Expected date: 17 April 1937 / Page N193-146_0115
1937-04-17 Expected date: 19 April 1937 / Page N193-146_0118
1937-04-18 1937-04-19 Expected date: 20 April 1937 / Page N193-146_0127
1937-04-20 Expected date: 21 April 1937 / Page N193-146_0136
1937-04-21 Expected date: 22 April 1937 / Page N193-146_0145
1937-04-22 Expected date: 23 April 1937 / Page N193-146_0154
1937-04-23 Expected date: 24 April 1937 / Page N193-146_0163
1937-04-24 Expected date: 27 April 1937 / Page N193-146_0166
1937-04-25 1937-04-26 1937-04-27 Expected date: 28 April 1937 / Page N193-146_0175
1937-04-28 Expected date: 29 April 1937 / Page N193-146_0184
1937-04-29 Expected date: 30 April 1937 / Page N193-146_0193
1937-04-30 Expected date: 1 May 1937 / Page N193-146_0202
1937-05-01 Expected date: 3 May 1937 / Page N193-146_0205
1937-05-02 1937-05-03 Expected date: 4 May 1937 / Page N193-146_0214
1937-05-04 Expected date: 5 May 1937 / Page N193-146_0223
1937-05-05 Expected date: 6 May 1937 / Page N193-146_0232
1937-05-06 Expected date: 7 May 1937 / Page N193-146_0241
1937-05-07 Expected date: 8 May 1937 / Page N193-146_0250
1937-05-08 Expected date: 10 May 1937 / Page N193-146_0253
1937-05-09 1937-05-10 Expected date: 11 May 1937 / Page N193-146_0262
1937-05-11 Expected date: 13 May 1937 / Page N193-146_0271
1937-05-12 1937-05-13 Expected date: 14 May 1937 / Page N193-146_0280
1937-05-14 Expected date: 15 May 1937 / Page N193-146_0289
1937-05-15 Expected date: 17 May 1937 / Page N193-146_0292
1937-05-16 1937-05-17 Expected date: 18 May 1937 / Page N193-146_0301
1937-05-18 Expected date: 19 May 1937 / Page N193-146_0310
1937-05-19 Expected date: 20 May 1937 / Page N193-146_0319
1937-05-20 Expected date: 21 May 1937 / Page N193-146_0328
1937-05-21 Expected date: 22 May 1937 / Page N193-146_0337
1937-05-22 Expected date: 24 May 1937 / Page N193-146_0340
1937-05-23 1937-05-24 Expected date: 25 May 1937 / Page N193-146_0349
1937-05-25 Expected date: 26 May 1937 / Page N193-146_0358
1937-05-26 Expected date: 27 May 1937 / Page N193-146_0367
1937-05-27 Expected date: 28 May 1937 / Page N193-146_0376
1937-05-28 Expected date: 29 May 1937 / Page N193-146_0385
1937-05-29 Expected date: 31 May 1937 / Page N193-146_0388
1937-05-30 1937-05-31 Expected date: 1 June 1937 / Page N193-146_0397
1937-06-01 Expected date: 2 June 1937 / Page N193-146_0406
1937-06-02 Expected date: 3 June 1937 / Page N193-146_0415
1937-06-03 Expected date: 4 June 1937 / Page N193-146_0424
1937-06-04 Expected date: 5 June 1937 / Page N193-146_0433
1937-06-05 Expected date: 7 June 1937 / Page N193-146_0436
1937-06-06 1937-06-07 Expected date: 8 June 1937 / Page N193-146_0445
1937-06-08 Expected date: 9 June 1937 / Page N193-146_0454
1937-06-09 Expected date: 10 June 1937 / Page N193-146_0463
1937-06-10 Expected date: 11 June 1937 / Page N193-146_0472
1937-06-11 Expected date: 12 June 1937 / Page N193-146_0481
1937-06-12 Expected date: 14 June 1937 / Page N193-146_0484
1937-06-13 1937-06-14 Expected date: 15 June 1937 / Page N193-146_0493
1937-06-15 Expected date: 16 June 1937 / Page N193-146_0502
1937-06-16 Expected date: 17 June 1937 / Page N193-146_0511
1937-06-17 Expected date: 18 June 1937 / Page N193-146_0520
1937-06-18 Expected date: 19 June 1937 / Page N193-146_0529
1937-06-19 Expected date: 21 June 1937 / Page N193-146_0532
1937-06-20 1937-06-21 Expected date: 22 June 1937 / Page N193-146_0541
1937-06-22 Expected date: 23 June 1937 / Page N193-146_0550
1937-06-23 Expected date: 24 June 1937 / Page N193-146_0559
1937-06-24 Expected date: 25 June 1937 / Page N193-146_0568
1937-06-25 Expected date: 26 June 1937 / Page N193-146_0577
1937-06-26 Expected date: 28 June 1937 / Page N193-146_0580
1937-06-27 1937-06-28 Expected date: 29 June 1937 / Page N193-146_0589
1937-06-29 Expected date: 30 June 1937 / Page N193-146_0598
1937-06-30 Expected date: 1 July 1937 / Page N193-146_0607
# Rename vol 14
# Inserts a leading zero into the middle segment of each header image name
# ('<a>-<b>-<rest>.jpg' becomes '<a>-0<b>-<rest>.jpg').
import os
from pathlib import Path
img_dir = Path('all_headers/AU NBAC N193-014')
# Take a snapshot of the directory first so files renamed below cannot be
# picked up a second time by a still-running directory scan.
images = sorted(img_dir.glob('*.jpg'))
for image in images:
    # Split on the first two hyphens only, so any further hyphens stay in the
    # final segment instead of being dropped from the new name.
    parts = image.name.split('-', 2)
    if len(parts) != 3:
        print(f'Skipping unexpected file name: {image.name}')
        continue
    if parts[1].startswith('014'):
        # Already renamed; re-running the cell must not add another zero.
        continue
    new_name = image.with_name(f'{parts[0]}-0{parts[1]}-{parts[2]}')
    image.rename(new_name)
# Rename vol 18
# Replaces every '.-' in the header image names with '-'.
import os
from pathlib import Path
img_dir = Path('all_headers/AU NBAC N193-018')
images = img_dir.glob('*.jpg')
for image in images:
    cleaned = image.name.replace(".-", "-")
    new_name = Path(f'all_headers/AU NBAC N193-018/{cleaned}')
    image.rename(new_name)