#!/usr/bin/env python # coding: utf-8 # # Test predictions # In[1]: import pandas as pd import arrow from IPython.display import display, HTML # Can we fill in the gaps? Looking at the beginning and end of 1929, it seems the number of pages for each session on a weekday are consistent: # # * Morning: 1 page # * Noon: 2 pages # * Afternoon: 2 pages # # So 5 pages per weekday. Saturdays have 2 pages. # # **Notes from when Maggie and I looked at the bound volumes** # # 1901 – five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – mix of small and large pages # # # # 1915 - five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – mix of small and large pages, some tiny font used in miscellaneous which continues # # # # 1924 - five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – only large pages, handwritten ‘investment sales’ # # # # 1934 - five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – only large pages, handwritten ‘investment sales’ # # # # 1940 – nine pages a day, 3 forenoon, 3 noon and 3 afternoon, Saturdays only have 3 pages for morning (we didn’t record how many pages for Saturday before) # # # # 1950 – eight pages a day, 4 for morning and 4 for afternoon, thinner paper, no Saturday trading # # There's an increase in pages after vol 135, in 1934. 
# In[2]:

df_series = pd.read_csv('series_list.csv')
all_holidays = pd.read_csv('nsw_holidays_1900_1950.csv')
# NOTE: the 'date' column is left exactly as read from the CSV because
# predict_pages() matches it against dates formatted as 'YYYY-MM-DD'
# (presumably the column holds strings in that format -- confirm against the CSV).


def get_holidays(year):
    """Return the 'date' values of every holiday row whose 'year' equals ``year``."""
    holidays = all_holidays.loc[all_holidays['year'] == year, 'date']
    return holidays.to_list()


# In[3]:

def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but not including, ``end_date``.

    Both arguments are expected to be Arrow objects (``shift`` is called on
    ``start_date``). Nothing is yielded when ``end_date`` is not after
    ``start_date``.
    """
    for n in range(int((end_date - start_date).days)):
        yield start_date.shift(days=+n)


# In[4]:

def predict_pages(start_date, end_date, weekday_pages=5, saturday_pages=2, include_saturday=True):
    """Predict the number of pages produced between two dates.

    Sundays and listed holidays contribute nothing, Saturdays contribute
    ``saturday_pages`` (when ``include_saturday`` is truthy) and every other
    day contributes ``weekday_pages``.

    Args:
        start_date: First day to count (Arrow object).
        end_date: Day after the last day to count (exclusive, Arrow object).
        weekday_pages: Pages added for each Monday-Friday that is not a holiday.
        saturday_pages: Pages added for each Saturday that is not a holiday.
        include_saturday: When falsy, Saturdays add no pages.

    Returns:
        The total predicted page count as an int.
    """
    # Gather holidays for every calendar year the range touches, not just the
    # start year, so volumes that run across New Year are handled correctly.
    # A set gives constant-time membership tests inside the loop.
    holidays = set()
    for year in range(start_date.year, end_date.year + 1):
        holidays.update(get_holidays(year))

    pages = 0
    for single_date in daterange(start_date, end_date):
        weekday = single_date.weekday()  # 0 = Monday ... 6 = Sunday
        if weekday == 6 or single_date.format('YYYY-MM-DD') in holidays:
            continue
        if weekday == 5:
            if include_saturday:
                pages += saturday_pages
        else:
            pages += weekday_pages
    return pages


# In[5]:

# Pages per trading day, keyed by an inclusive volume range 'first_last'.
pages_per_vol = {
    '1_134': {
        'weekday': 5,
        'saturday': 2
    },
    '135_145': {
        'weekday': 6,
        'saturday': 2
    },
    '146_164': {
        'weekday': 9,
        'saturday': 3
    },
    '165_190': {
        'weekday': 6,
        'saturday': 3
    },
    '191_199': {
        'weekday': 8,
        'saturday': 0
    }
}


def get_pages(vol_num):
    """Return the page-count dict for ``vol_num``, or ``None`` if no range covers it.

    Keys of ``pages_per_vol`` are either a single volume number or an
    inclusive 'first_last' range.
    """
    for key, pages in pages_per_vol.items():
        bounds = [int(part) for part in key.split('_')]
        # Compare against the bounds directly rather than materialising
        # every volume number in the range on each call.
        if bounds[0] <= vol_num <= bounds[-1]:
            return pages
    return None


# In[6]:

predictions = []
# The final row that has both dates is excluded by the [:-1] slice.
for row in df_series.dropna(subset=['start_date', 'end_date'])[:-1].itertuples():
    start_date = arrow.get(row.start_date, 'YYYY-MM-DD')
    # daterange() stops before its end point, so step one day past the
    # volume's last date to make that date count.
    end_date = arrow.get(row.end_date, 'YYYY-MM-DD').shift(days=+1)
    # The volume number is the last '-'-separated part of the item number.
    vol_num = int(row.Item_number.split('-')[-1])
    pages_vol = get_pages(vol_num)
    if pages_vol is None:
        # Fail with a clear message instead of "'NoneType' is not subscriptable".
        raise ValueError(f'No page counts configured for volume {vol_num} ({row.Item_number})')
    prediction = predict_pages(
        start_date,
        end_date,
        weekday_pages=pages_vol['weekday'],
        saturday_pages=pages_vol['saturday'],
    )
    predictions.append({'volume': row.Item_number, 'predicted': prediction, 'actual': row.Pages})

df_predictions = pd.DataFrame(predictions)
# Signed error per volume: predicted page count minus the actual page count.
df_predictions['difference'] = df_predictions['predicted'].sub(df_predictions['actual'])


# In[7]:

# Raise the row display limit to 200.
pd.set_option("display.max_rows", 200)

import seaborn as sns

# Light-to-dark green colour map used to shade the 'difference' column.
cm = sns.light_palette("green", as_cmap=True)

# Final expression of the cell: the styled table (rendered when run in a notebook).
df_predictions.style.background_gradient(
    cmap=cm,
    subset=pd.IndexSlice[:, ['difference']],
)


# In[ ]: