import pandas as pd
import arrow
import re
from IPython.display import display, HTML
import altair as alt
# Series list of ledger volumes (item numbers + start/end dates).
# The final row is dropped -- presumably a trailing blank/summary row in the
# CSV; TODO confirm against series_list.csv.
df_series = pd.read_csv('series_list.csv')[:-1]
# NSW public holidays 1900-1950; expected to have at least 'year' and 'date'
# columns (dates as 'YYYY-MM-DD' strings, compared as strings below).
all_holidays = pd.read_csv('nsw_holidays_1900_1950.csv')
#all_holidays.loc[:, 'date'] = pd.to_datetime(all_holidays.loc[:, 'date'], errors='coerce')
def get_holidays(year, holidays_df=None):
    '''
    Return the holiday dates for the given year as a list of strings.

    Parameters:
        year: calendar year to filter on (matched against the 'year' column).
        holidays_df: optional DataFrame with 'year' and 'date' columns;
            defaults to the module-level all_holidays table. Added as a
            backward-compatible generalization so the lookup can be run
            against any holiday table.
    '''
    source = all_holidays if holidays_df is None else holidays_df
    holidays = source.loc[source['year'] == year]['date']
    return holidays.to_list()
def daterange(start_date, end_date):
    '''
    Yield each day from start_date (inclusive) up to end_date (exclusive).

    Both arguments are arrow dates (anything with subtraction yielding a
    .days delta and a .shift(days=...) method works).
    '''
    total_days = int((end_date - start_date).days)
    for offset in range(total_days):
        yield start_date.shift(days=offset)
# Per-date overrides for the number of ledger pages recorded on a date.
# calculate_date() consults this before the default weekday/Saturday page
# counts; 0 means no pages exist for that day (holiday, loss, etc.) -- see
# the per-entry notes.
missing = {
    '1901-01-07': 3,
    '1901-01-18': 4,
    '1901-01-23': 0, # Death of the Queen business abandoned https://trove.nla.gov.au/newspaper/article/14371864/1343690
    '1901-02-25': 4,
    '1901-03-18': 0,
    '1901-03-29': 0, # missing
    '1901-04-04': 3, # No afternoon, day before Easter
    '1901-04-09': 0, # Extra Easter Tuesday
    '1901-04-10': 0, # Extra Easter Wednesday
    '1901-05-27': 0, # Holiday Duke of Cornwall visiting
    '1901-05-28': 0, # Holiday Duke of Cornwall visiting
    '1901-07-03': 0, # Holiday for polling day
    '1901-09-16': 4, # No morning
    '1901-10-10': 4, # 1 Noon
    '1901-10-30': 4, # 1 Noon
    '1901-12-16': 2, # Noon only
    '1902-02-26': 0, # ??
    '1902-04-02': 3, # No afternoon
    '1902-06-26': 0, # ??
    '1902-08-09': 0, # ??
    '1902-10-17': 6, # 008_0063 is a duplicate
    '1903-01-06': 4, # 1 afternoon missing
    '1903-01-09': 4, # morning missing
    '1903-04-09': 3, # No afternoon, day before Easter
    '1903-04-14': 0, # Easter Tuesday
    # 1903-09-02 has no morning, but 3 noons
    '1903-09-08': 4, # no morning
    # 1903-09-16 has no morning, but 3 noons
    '1903-10-01': 3, # no afternoon
    '1903-11-18': 3, # no morning, 1 noon -- see 219 and 220!
    '1903-11-30': 7, # 2 sheets from 1903-11-18 inserted
    '1903-12-16': 0, # ??
    '1904-01-20': 3, # no afternoon
    '1904-08-15': 3, # no afternoon
    '1904-11-09': 6, # 016_145 is a duplicate
    '1905-03-02': 6, # 017_213 is a duplicate
    '1905-03-08': 6, # 017_239 is a duplicate
    '1905-04-20': 3, # No afternoon, day before Easter
    '1905-04-25': 0, # Easter Tuesday
    '1905-04-26': 0, # Easter Wednesday
    '1906-03-19': 6, # extra page, 282 is from 1906-03-21
    '1906-03-21': 4, # 1 page included in 1906-03-19
    '1906-04-02': 4, # 1 afternoon missing
    '1906-04-06': 4, # 1 afternoon missing
    '1906-04-09': 4, # 1 afternoon missing
    '1906-04-10': 4, # 1 afternoon missing
    '1906-04-11': 4, # 1 afternoon missing
    '1906-04-12': 3, # No afternoon, day before Easter
    '1906-04-17': 0, # Easter Tuesday
    '1906-04-18': 0, # Easter Wednesday
    '1906-04-25': 4, # 1 afternoon missing
    '1906-05-02': 4, # 1 afternoon missing
    '1906-05-03': 4, # 1 afternoon missing
    '1906-07-12': 4, # 1 afternoon missing
    '1906-07-16': 4, # 1 afternoon missing
    '1906-10-25': 3, # Afternoon missing
    '1907-02-02': 1, # Saturday 1 page only
    '1907-03-08': 4, # 1 afternoon missing
    '1907-04-29': 4, # 1 afternoon missing
    '1907-06-27': 2, # 2 pages only marked '11 o'clock'
    '1907-09-10': 3, # No afternoon
    '1907-10-11': 4, # 1 afternoon missing
    '1907-11-29': 4, # 1 afternoon missing
    '1907-12-02': 4, # 1 afternoon missing
    '1908-03-12': 4, # 1 afternoon missing
    '1908-04-16': 3, # No afternoon, day before Easter
    '1908-04-21': 0, # Easter Tuesday
    '1908-08-20': 0, # American Fleet visit!
    '1908-08-21': 3, # No morning?
    '1908-08-24': 0, # American Fleet visit!
    '1908-11-14': 1, # Saturday 1 page only
    '1929-03-01': 4,
    '1929-03-12': 3,
    '1929-03-27': 3,
    '1930-02-26': 3,
    '1930-04-17': 3,
    '1930-04-22': 0,
    '1930-04-23': 0,
    '1930-04-24': 0,
    '1930-04-26': 0,
    '1930-05-09': 3,
    '1930-12-23': 3
}
# Page image identifiers known to be duplicate scans (cf. the notes in the
# `missing` dict, e.g. '008_0063' on 1902-10-17).
# NOTE(review): not referenced elsewhere in this file; presumably used
# downstream to exclude duplicate pages -- TODO confirm.
duplicates = [
    '008_0063',
    '016_145',
    '017_213',
    '119_265'
]
# NOTE(review): presumably volume numbers whose pages run in reverse order;
# not referenced elsewhere in this file -- TODO confirm usage.
backwards = [
    '120'
]
# Expected number of pages per trading day for each range of volume numbers.
# Keys are inclusive volume ranges as 'first_last'; values give the page
# counts for a normal weekday and for a Saturday. Looked up via get_pages().
pages_per_vol = {
    '1_134': {
        'weekday': 5,
        'saturday': 2
    },
    '135_145': {
        'weekday': 6,
        'saturday': 2
    },
    '146_164': {
        'weekday': 9,
        'saturday': 3
    },
    '165_190': {
        'weekday': 6,
        'saturday': 3
    },
    '191_199': {
        'weekday': 6,
        'saturday': 0
    },
}
def get_pages(vol_num, volume_ranges=None):
    '''
    Return the per-day page counts for a volume number.

    Parameters:
        vol_num: integer volume number to look up.
        volume_ranges: optional mapping of 'first_last' range keys to
            {'weekday': n, 'saturday': n} dicts; defaults to the
            module-level pages_per_vol table (backward-compatible
            generalization).

    Returns the matching page-count dict, or None when vol_num falls in
    no range.
    '''
    ranges = pages_per_vol if volume_ranges is None else volume_ranges
    for key, pages in ranges.items():
        bounds = [int(part) for part in key.split('_')]
        if len(bounds) == 2:
            # Expand 'first_last' into the full inclusive range.
            bounds = list(range(bounds[0], bounds[1] + 1))
        if vol_num in bounds:
            return pages
    return None
def calculate_date(image_name, start_date, end_date, holidays, weekday_pages=5, saturday_pages=2):
    '''
    Work out the date of a page by counting pages forward from start_date.

    Walks each day in [start_date, end_date), accumulating the number of
    pages expected on that day, until the running total reaches the page
    number embedded in image_name.

    Parameters:
        image_name: image name containing 'N193-<volume>_<page>'.
        start_date: arrow date of the volume's first day (inclusive).
        end_date: arrow date bound (exclusive).
        holidays: collection of 'YYYY-MM-DD' strings with no pages.
        weekday_pages: pages per normal weekday (default 5).
        saturday_pages: pages per Saturday (default 2).

    Returns the arrow date for the page, or None if the page number lies
    beyond the date range.

    Fix: weekday_pages/saturday_pages were previously accepted but ignored
    (counts were hardcoded to 5 and 2), so per-volume page counts never
    took effect; they are now used, with defaults preserving the old
    behavior for callers that omit them.
    '''
    page_num = int(re.search(r'N193\-\d+_(\d+)', image_name).group(1))
    pages = 0
    for single_date in daterange(start_date, end_date):
        date_str = single_date.format('YYYY-MM-DD')
        if date_str in missing:
            # Per-date override (missing/extra/duplicate pages).
            skip = missing[date_str]
        elif single_date.weekday() == 6 or date_str in holidays:
            # Sundays (weekday() == 6) and public holidays: no pages.
            skip = 0
        elif single_date.weekday() == 5:
            skip = saturday_pages
        else:
            skip = weekday_pages
        pages += skip
        if pages >= page_num:
            return single_date
    return None
def calculate_single_date(image_name):
    '''
    Get the date of a single page.

    Looks up the page's volume in df_series to find the volume's date
    range, then counts pages forward to the date.

    Fix: previously called calculate_date() without the weekday/saturday
    page counts it requires (a TypeError); the counts are now looked up
    for the page's volume via get_pages().
    '''
    vol_ref = re.search(r'(N193\-\d+)_(\d+)', image_name).group(1)
    # First series row whose Item_number contains this volume reference.
    volume = df_series.loc[df_series['Item_number'].str.contains(vol_ref)].iloc[0]
    start_date = arrow.get(volume['start_date'], 'YYYY-MM-DD')
    # End bound is exclusive in calculate_date, so step one day past it.
    end_date = arrow.get(volume['end_date'], 'YYYY-MM-DD').shift(days=+1)
    holidays = sorted(get_holidays(start_date.year))
    vol_pages = get_pages(int(vol_ref.split('-')[-1]))
    single_date = calculate_date(image_name, start_date, end_date, holidays,
                                 vol_pages['weekday'], vol_pages['saturday'])
    return single_date
def save_dates(year):
    '''
    Assign dates to pages from a year.

    Reads '{year}.csv', computes each page's date from the volume date
    ranges in df_series, and returns the dataframe with an added
    'page_date' column ('YYYY-MM-DD' strings, merged on 'name').

    (Removed an unused `volumes` local and a commented-out duplicate
    read of series_list.csv.)
    '''
    page_dates = []
    df = pd.read_csv(f'{year}.csv')
    holidays = get_holidays(year)
    for vol_ref, rows in df.groupby(by='referenceCode'):
        vol_num = int(vol_ref.split('-')[-1])
        vol_pages = get_pages(vol_num)
        # First series row whose Item_number contains this volume reference.
        volume = df_series.loc[df_series['Item_number'].str.contains(vol_ref)].iloc[0]
        start_date = arrow.get(volume['start_date'], 'YYYY-MM-DD')
        # End bound is exclusive in calculate_date, so step one day past it.
        end_date = arrow.get(volume['end_date'], 'YYYY-MM-DD').shift(days=+1)
        for row in rows.itertuples():
            page_date = calculate_date(row.name, start_date, end_date, holidays,
                                       vol_pages['weekday'], vol_pages['saturday'])
            page_dates.append({'name': row.name, 'page_date': page_date.format('YYYY-MM-DD')})
    df_new = pd.merge(df, pd.DataFrame(page_dates), on='name')
    return df_new
# Assign dates to every 1901 page and preview the first rows.
df_new = save_dates(1901)
df_new.head()
directory | name | path | referenceCode | startDate | endDate | year | width | height | columns | column_positions | page_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AU NBAC N193-001/ | N193-001_0001.tif | Shared/ANU-Library/Sydney Stock Exchange 1901-... | N193-001 | 1901-01-01 | 1901-03-01 | 1901 | 6237 | 5000 | 3 | 0,1811,3222 | 1901-01-07 |
1 | AU NBAC N193-001/ | N193-001_0002.tif | Shared/ANU-Library/Sydney Stock Exchange 1901-... | N193-001 | 1901-01-01 | 1901-03-01 | 1901 | 6266 | 5000 | 3 | 205,1840,3259 | 1901-01-07 |
2 | AU NBAC N193-001/ | N193-001_0003.tif | Shared/ANU-Library/Sydney Stock Exchange 1901-... | N193-001 | 1901-01-01 | 1901-03-01 | 1901 | 6237 | 5000 | 2 | 286,2068 | 1901-01-07 |
3 | AU NBAC N193-001/ | N193-001_0004.tif | Shared/ANU-Library/Sydney Stock Exchange 1901-... | N193-001 | 1901-01-01 | 1901-03-01 | 1901 | 6236 | 5000 | 3 | 9,1821,3219 | 1901-01-08 |
4 | AU NBAC N193-001/ | N193-001_0005.tif | Shared/ANU-Library/Sydney Stock Exchange 1901-... | N193-001 | 1901-01-01 | 1901-03-01 | 1901 | 6236 | 5000 | 3 | 288,1821,3220 | 1901-01-08 |
from math import ceil
def week_of_month(dt):
    """Return the 1-based week of the month containing the given date.

    Week 1 begins on the 1st of the month; subsequent weeks run
    Monday-Sunday (weekday() convention: Monday == 0).
    """
    first_weekday = dt.replace(day=1).weekday()
    return int(ceil((dt.day + first_weekday) / 7.0))
def make_year_chart(df, year):
    '''
    Build a faceted calendar heatmap of page counts per day for one year.

    NOTE: mutates df in place -- parses 'page_date' to datetimes and adds
    a 'week' (week-of-month) column.
    '''
    df.loc[:, 'page_date'] = pd.to_datetime(df.loc[:, 'page_date'], errors='coerce')
    df.loc[:, 'week'] = df.loc[:, 'page_date'].apply(week_of_month)
    day_axis = alt.X('day(page_date):O', title='day')
    month_facet = alt.Facet('month(page_date):T', title='')
    count_colour = alt.Color('count():Q', scale=alt.Scale(scheme='greenblue'))
    calendar = alt.Chart(df).mark_rect(stroke='white').encode(
        x=day_axis,
        y='week:O',
        facet=month_facet,
        color=count_colour,
        tooltip=['page_date:T', 'count():Q']
    )
    return calendar.properties(columns=4, title=str(year))
def display_years(years=None):
    '''
    Render calendar heatmaps for each year, stacked vertically.

    Parameters:
        years: iterable of years to chart; defaults to 1901-1908.

    Fix: replaced the mutable default argument (years=list(...)) with the
    None-sentinel idiom; behavior is unchanged for all callers.
    '''
    if years is None:
        years = range(1901, 1909)
    all_charts = alt.vconcat()
    for year in years:
        df = save_dates(year)
        all_charts &= make_year_chart(df, year)
    display(all_charts)
display_years()
def save_years(years=None):
    '''
    Assign dates to pages for each year and save as '{year}_with_dates.csv'.

    Parameters:
        years: iterable of years to process; defaults to 1901-1908.

    Fix: replaced the mutable default argument (years=list(...)) with the
    None-sentinel idiom; behavior is unchanged for all callers.
    '''
    if years is None:
        years = range(1901, 1909)
    for year in years:
        df = save_dates(year)
        df.to_csv(f'{year}_with_dates.csv')
save_years()