#!/usr/bin/env python
# coding: utf-8

# # Test predictions

# In[1]:


import pandas as pd
import arrow
from IPython.display import display, HTML


# Can we fill in the gaps? Looking at the beginning and end of 1929, it seems the number of pages for each session on a weekday is consistent:
# 
# * Morning: 1 page
# * Noon: 2 pages
# * Afternoon: 2 pages
# 
# So 5 pages per weekday. Saturdays have 2 pages. 
# 
# **Notes from when Maggie and I looked at the bound volumes**
# 
# 1901 – five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – mix of small and large pages
# 
#  
# 
# 1915 - five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – mix of small and large pages, some tiny font used in miscellaneous which continues
# 
#  
# 
# 1924 - five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – only large pages, handwritten ‘investment sales’
# 
#  
# 
# 1934 - five pages a day, 1 for mining forenoon, 2 for noon and 2 for afternoon – only large pages, handwritten ‘investment sales’
# 
#  
# 
# 1940 – nine pages a day: 3 forenoon, 3 noon and 3 afternoon. Saturdays only have 3 pages, for the morning session (we didn’t record how many pages Saturdays had in the earlier volumes)
# 
#  
# 
# 1950 – eight pages a day, 4 for morning and 4 for afternoon, thinner paper, no Saturday trading
# 
# There's an increase in pages after vol 135, in 1934.

# In[2]:


# Load the volume listing and the holiday table from CSV files given by
# relative path (presumably NSW public holidays, going by the file name).
df_series = pd.read_csv('series_list.csv')
all_holidays = pd.read_csv('nsw_holidays_1900_1950.csv')
# NOTE: the 'date' column is not converted to datetimes here; its values are
# kept exactly as read_csv loaded them.


def get_holidays(year):
    """Return the holiday dates recorded for ``year`` as a plain list.

    Rows of ``all_holidays`` whose 'year' column equals ``year`` are selected
    and their 'date' values are returned. A year with no matching rows gives
    an empty list.
    """
    in_year = all_holidays['year'] == year
    return all_holidays.loc[in_year, 'date'].to_list()


# In[3]:


def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but excluding, ``end_date``.

    ``start_date`` must support subtraction from ``end_date`` (giving an
    object with a ``days`` attribute) and a ``shift(days=...)`` method, as
    arrow dates do. Nothing is yielded when ``end_date`` is not after
    ``start_date``.
    """
    span = end_date - start_date
    total_days = int(span.days)
    offset = 0
    while offset < total_days:
        yield start_date.shift(days=offset)
        offset += 1


# In[4]:


def predict_pages(start_date, end_date, weekday_pages=5, saturday_pages=2, include_saturday=True):
    """Estimate the number of pages produced between two dates.

    Walks every day yielded by ``daterange(start_date, end_date)``, i.e. from
    ``start_date`` up to but not including ``end_date``. Days with weekday 6
    and days listed as holidays add nothing, days with weekday 5 add
    ``saturday_pages`` (when enabled), and every other day adds
    ``weekday_pages``.

    Args:
        start_date: First day to count; an arrow-style date exposing
            ``year``, ``format()`` and ``weekday()``.
        end_date: Exclusive upper bound of the range; must expose ``year``.
        weekday_pages: Pages added for each counted day that is not a
            Saturday.
        saturday_pages: Pages added for each counted Saturday.
        include_saturday: When false, Saturdays add no pages.

    Returns:
        int: Total predicted page count for the range.
    """
    # Gather holidays for every calendar year the range touches. Looking up
    # start_date.year alone would miss all holidays after the range crosses
    # into a new year. A set also makes the per-day membership test cheap.
    holidays = set()
    for year in range(start_date.year, end_date.year + 1):
        holidays.update(get_holidays(year))

    pages = 0
    for single_date in daterange(start_date, end_date):
        weekday = single_date.weekday()  # datetime convention: Monday=0 ... Sunday=6
        # Holiday entries are compared as 'YYYY-MM-DD' text.
        if weekday == 6 or single_date.format('YYYY-MM-DD') in holidays:
            continue
        if weekday == 5:
            if include_saturday:
                pages += saturday_pages
        else:
            pages += weekday_pages
    return pages


# In[5]:


# Pages per trading day for groups of volumes. Each key is 'first_last', an
# inclusive range of volume numbers; each value gives the page count used for
# a weekday and for a Saturday.
pages_per_vol = {
    '1_134': {'weekday': 5, 'saturday': 2},
    '135_145': {'weekday': 6, 'saturday': 2},
    '146_164': {'weekday': 9, 'saturday': 3},
    '165_190': {'weekday': 6, 'saturday': 3},
    '191_199': {'weekday': 8, 'saturday': 0},
}


def get_pages(vol_num):
    """Return the page-count entry of ``pages_per_vol`` covering ``vol_num``.

    A key with exactly two underscore-separated numbers is treated as an
    inclusive range; any other key is treated as an explicit list of volume
    numbers. Returns ``None`` when no key covers ``vol_num``.
    """
    for span, counts in pages_per_vol.items():
        bounds = [int(part) for part in span.split('_')]
        if len(bounds) == 2:
            first, last = bounds
            candidates = range(first, last + 1)
        else:
            candidates = bounds
        if vol_num in candidates:
            return counts
    return None


# In[6]:


# Only rows with both a start and an end date can be predicted. The last such
# row is sliced off as well (NOTE(review): reason not visible here - confirm).
dated_volumes = df_series.dropna(subset=['start_date', 'end_date'])[:-1]

predictions = []
for row in dated_volumes.itertuples():
    start_date = arrow.get(row.start_date, 'YYYY-MM-DD')
    # The range walked by predict_pages stops before its end date, so move the
    # end one day later to have the volume's final day counted.
    end_date = arrow.get(row.end_date, 'YYYY-MM-DD').shift(days=+1)
    # The volume number is the last hyphen-separated piece of the item number.
    vol_num = int(row.Item_number.split('-')[-1])
    pages_vol = get_pages(vol_num)
    prediction = predict_pages(
        start_date,
        end_date,
        weekday_pages=pages_vol['weekday'],
        saturday_pages=pages_vol['saturday'],
    )
    predictions.append({
        'volume': row.Item_number,
        'predicted': prediction,
        'actual': row.Pages,
    })

# One row per volume; a positive difference means the prediction is higher
# than the recorded page count.
df_predictions = pd.DataFrame(predictions)
df_predictions['difference'] = df_predictions['predicted'] - df_predictions['actual']


# In[7]:


# Raise pandas' row display limit to 200.
pd.options.display.max_rows = 200
import seaborn as sns

# Shade only the 'difference' column, using a light-to-green colour map.
cm = sns.light_palette("green", as_cmap=True)
difference_only = pd.IndexSlice[:, ['difference']]
df_predictions.style.background_gradient(cmap=cm, subset=difference_only)


# In[ ]: