import datetime
import time

import numpy as np
import pandas as pd
import cufflinks
import plotly.offline as py
import plotly.graph_objs as go
from plotly import tools
py.init_notebook_mode(connected=True)
df = pd.read_csv('SEG_Saleshistory_Stores.csv')
df.columns = ['date', 'store', 'item', 'sales']
df.date = pd.to_datetime(df.date)
df.item = df.item.astype('category')
df.store = df.store.astype('category')
df.dtypes
date     datetime64[ns]
store          category
item           category
sales             int64
dtype: object
Below are some summary statistics on the data. Overall, the sales quantities for individual items at individual stores are quite small. It would be difficult to forecast daily quantities for individual items at individual stores, so we will work towards forecasting weekly item sales at individual stores.
df.describe(include='all')
|        | date                | store  | item     | sales         |
|--------|---------------------|--------|----------|---------------|
| count  | 425575              | 425575 | 425575.0 | 425575.000000 |
| unique | 342                 | 181    | 39.0     | NaN           |
| top    | 2016-11-05 00:00:00 | SEGWD7 | 41795.0  | NaN           |
| freq   | 4110                | 6111   | 27516.0  | NaN           |
| first  | 2016-02-04 00:00:00 | NaN    | NaN      | NaN           |
| last   | 2017-01-11 00:00:00 | NaN    | NaN      | NaN           |
| mean   | NaN                 | NaN    | NaN      | 1.670345      |
| std    | NaN                 | NaN    | NaN      | 1.112165      |
| min    | NaN                 | NaN    | NaN      | 1.000000      |
| 25%    | NaN                 | NaN    | NaN      | 1.000000      |
| 50%    | NaN                 | NaN    | NaN      | 1.000000      |
| 75%    | NaN                 | NaN    | NaN      | 2.000000      |
| max    | NaN                 | NaN    | NaN      | 112.000000    |
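Both the 25th and 50th percentiles sit at a single unit, so single-unit days dominate the data. A quick illustrative check of exactly how dominant (this figure is not part of the output above):
# Share of daily store-item records that sold exactly one unit.
(df['sales'] == 1).mean()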
Let's take a look at a few cuts of the data to see if we can spot any trends. First we plot total weekly sales, followed by sales for a handful of individual stores. It looks like the answer to the question about the big jump in sales in September is the addition of a good number of stores.
# Aggregate total sales by week. Unstacking the lone date index level leaves
# a Series with a ('sales', date) MultiIndex, so the weekly dates live in
# index level 1.
df_total = df.groupby(pd.Grouper(freq='W', key='date')).sum().fillna(0).unstack('date', 0)
df_total.index.levels[1]
# Sanity check: one entry per week.
len(df_total) == len(df_total.index.levels[1])
trace = go.Scatter(
    x=df_total.index.levels[1],
    y=df_total
)
layout = go.Layout(
    title='Total Sales'
)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='total-sales')
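As a quick cross-check (not in the original run), the same weekly totals can be computed with resample, which should agree with the Grouper-based aggregation above:
# Weekly totals via resample should match the Grouper result.
df.set_index('date')['sales'].resample('W').sum().head()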
df_1w = df.groupby(['store', pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)
rows = 10
cols = 3
spidx = np.arange(rows*cols).reshape(rows, cols)
fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True,
                          subplot_titles=df_1w.index[:rows*cols])
for i in range(rows):
    for j in range(cols):
        trace = go.Scatter(
            x=df_1w.columns.levels[1],  # the weekly dates
            y=df_1w.iloc[spidx[i, j]],
        )
        fig.append_trace(trace, i+1, j+1)
fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')
# Count the number of stores with at least one sale in each week.
store_sales = df.groupby(['store', pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date')
stores_with_sales = store_sales['sales'].where(store_sales.sales > 0).count()
stores_with_sales.index
trace = go.Bar(
    x=stores_with_sales.index,
    y=stores_with_sales
)
layout = go.Layout(
    title='No. of Stores with Sales'
)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='stores-with-sales')
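If the September jump really does come from store additions, each store's first recorded sale should cluster around that time. A sketch of that check (not in the original notebook):
# Month in which each store first recorded a sale.
first_sale = df.groupby('store')['date'].min()
first_sale.dt.to_period('M').value_counts().sort_index()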
df_1w = df.groupby(['item', pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)
rows = 13
cols = 3
fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True,
                          subplot_titles=df_1w.index[:rows*cols])
spidx = np.arange(rows*cols).reshape(rows, cols)
for i in range(rows):
    for j in range(cols):
        trace = go.Scatter(
            x=df_1w.columns.levels[1],  # the weekly dates
            y=df_1w.iloc[spidx[i, j]],
        )
        fig.append_trace(trace, i+1, j+1)
fig['layout'].update(height=250*rows, title='Sales by Item', showlegend=False);
py.iplot(fig, filename='sales-by-item')
# Count the number of items with at least one sale in each week.
item_sales = df.groupby(['item', pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date')
items_with_sales = item_sales['sales'].where(item_sales.sales > 0).count()
items_with_sales.index
trace = go.Bar(
    x=items_with_sales.index,
    y=items_with_sales
)
layout = go.Layout(
    title='No. of Items with Sales'
)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='items-with-sales')
But before we do that, let's trim the date range to roughly 2016-03-01 through 2016-12-15 in order to avoid the anomalies evident at the edges of the total sales graph above. We trim to an approximate range here just to reduce the amount of data we have to process; once we aggregate into order periods we will trim again so that we end up with whole periods on either end of our data.
df = df[(df['date'] > '2016-02-15') & (df['date'] < '2016-12-30')]
We need to make sure that we add records with zero sales for any combination of store, item and period that doesn't appear in the dataset so that our trailing averages are calculated correctly.
import itertools
beg_date = df['date'].min()
end_date = df['date'].max()
dates = pd.date_range(start=beg_date, end=end_date, freq='D')
items = df['item'].value_counts().index
stores = df['store'].value_counts().index
# Build a frame with a zero-sales record for every (date, store, item)
# combination, then fold the actual sales in on top of it.
all_periods = pd.DataFrame(list(itertools.product(dates, stores, items)),
                           columns=['date', 'store', 'item'])
all_periods['sales'] = 0
all_periods.date = pd.to_datetime(all_periods.date)
all_periods.item = all_periods.item.astype('category')
all_periods.store = all_periods.store.astype('category')
df_all = pd.concat([all_periods, df]).groupby(['store', 'item', 'date']).sum().fillna(0).reset_index()
len(df_all)
2237703
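As a sanity check, the concatenation should contain every (date, store, item) combination exactly once, so the length above should equal the product of the three dimensions. An illustrative check:
# Every (date, store, item) combination should appear exactly once.
len(df_all) == len(dates) * len(stores) * len(items)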
This time we also want to predict sales for order periods of two and three times per week as well as weekly, assuming orders are placed on the same days each week. The day numbers start on Monday with 0 and end on Sunday with 6.
For the two orders per week frequency, we will predict sales for periods running Wednesday through Friday and Saturday through Tuesday.
For the three orders per week frequency, we will predict sales for periods running Sunday through Tuesday, Wednesday through Thursday, and Friday through Saturday.
In order to aggregate sales over the correct periods we need to add columns to represent the series for each order period. The ends of the two orders per week periods are recorded in the column freq2_end, and the ends of the three orders per week periods in freq3_end. We also add columns to distinguish between the intra-week periods (which have a zero-based index) so that the model can compensate for differences in sales volume between intra-week periods; these are freq2_per and freq3_per.
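To make the mapping concrete, here is a toy illustration (with hypothetical dates) of how one Monday-to-Sunday week falls into the three orders per week periods; the real assignment happens in the code below.
# Map one Monday-to-Sunday week onto the freq3 intra-week periods.
week = pd.date_range('2016-10-03', periods=7, freq='D')  # Mon .. Sun
freq3 = [0, 0, 1, 1, 2, 2, 0]                            # one entry per weekday
print({d.strftime('%a'): freq3[d.weekday()] for d in week})
# {'Mon': 0, 'Tue': 0, 'Wed': 1, 'Thu': 1, 'Fri': 2, 'Sat': 2, 'Sun': 0}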
df_f = df_all.copy()
# Assign each record to its respective intra-week group.
#
# These are the intra-week periods that each day of the week belongs to,
# indexed Monday (0) through Sunday (6).
freq = [[0, 0, 1, 1, 1, 0, 0],
        [0, 0, 1, 1, 2, 2, 0]]
# Map the day of the week of each record to its respective intra-week period.
for i, f in enumerate(freq):
    df_f['freq' + str(i + 2) + '_per'] = df_f['date'].dt.weekday.map(pd.Series(f))
# Assign each record to its respective group within each series of intra-week groups.
# Group membership is indicated in a separate column by the end date of the group.
#
# 1. Calculate the numeric day of the week for each date in the range of dates
# in the data.
# 2. Create a boolean array with an entry for each record indicating whether
# the date of the record falls on a day of the week on which an intra-week
# period ends.
# 3. Calculate the cumulative sum of the boolean array for the range of dates, which
# will then represent the sequential period each date in our range belongs to.
# 4. Index the cumulative sums by the range of dates to create a lookup table.
# 5. Map the 'date' column in our data to the sequence number using the lookup table.
# 6. Group the lookup table by the period, aggregating the date column by max, which
# represents the end date of each sequential period, to create another lookup table.
# 7. Map the sequence number series we created earlier to the period ending date using
# the new lookup table and add it to our data frame.
#
# These are the days of the week that new periods begin on for order frequencies of two
# and three times per week.
period_ends = [[2, 5],
               [2, 4, 6]]
# Execute the same process for each of our order frequencies
for i, p in enumerate(period_ends):
    # Steps 1 through 4
    periods = pd.Series(dates.weekday).isin(p).cumsum()
    date_lookup = pd.DataFrame({'date': dates, 'period': periods})
    date_lookup.set_index('date', inplace=True)
    # Step 5
    seq_col = df_f['date'].map(date_lookup.period)
    # Step 6
    period_lookup = date_lookup.reset_index().groupby('period').max()
    period_lookup.to_csv('freq' + str(i+2) + '.csv')
    # Step 7
    df_f['freq' + str(i+2) + '_end'] = seq_col.map(period_lookup.date)
df_f[df_f['date'] > '2016-10-06'].head(10)
|     | store    | item  | date       | sales | freq2_per | freq3_per | freq2_end  | freq3_end  |
|-----|----------|-------|------------|-------|-----------|-----------|------------|------------|
| 234 | SEGWD103 | 41774 | 2016-10-07 | 2.0   | 1         | 2         | 2016-10-07 | 2016-10-08 |
| 235 | SEGWD103 | 41774 | 2016-10-08 | 0.0   | 0         | 2         | 2016-10-11 | 2016-10-08 |
| 236 | SEGWD103 | 41774 | 2016-10-09 | 0.0   | 0         | 0         | 2016-10-11 | 2016-10-11 |
| 237 | SEGWD103 | 41774 | 2016-10-10 | 0.0   | 0         | 0         | 2016-10-11 | 2016-10-11 |
| 238 | SEGWD103 | 41774 | 2016-10-11 | 0.0   | 0         | 0         | 2016-10-11 | 2016-10-11 |
| 239 | SEGWD103 | 41774 | 2016-10-12 | 0.0   | 1         | 1         | 2016-10-14 | 2016-10-13 |
| 240 | SEGWD103 | 41774 | 2016-10-13 | 2.0   | 1         | 1         | 2016-10-14 | 2016-10-13 |
| 241 | SEGWD103 | 41774 | 2016-10-14 | 1.0   | 1         | 2         | 2016-10-14 | 2016-10-15 |
| 242 | SEGWD103 | 41774 | 2016-10-15 | 1.0   | 0         | 2         | 2016-10-18 | 2016-10-15 |
| 243 | SEGWD103 | 41774 | 2016-10-16 | 1.0   | 0         | 0         | 2016-10-18 | 2016-10-18 |
That looks like it's working correctly.
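A programmatic spot check helps too. Given the period definitions above, nearly every freq2 period should end on a Tuesday (weekday 1) or a Friday (weekday 4); only a partial period at the very end of the date range can differ. A sketch:
# Share of records whose freq2 period ends on a Tuesday or Friday.
df_f['freq2_end'].dt.weekday.isin([1, 4]).mean()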
Adding in the rolling average sales is more complicated because we want to calculate the average over like intra-week periods. For example, for the two orders per week frequency, one period spans Saturday through Tuesday and the other Wednesday through Friday. When we compute trailing averages for a Saturday-through-Tuesday period we only want to include prior Saturday-through-Tuesday periods and exclude the Wednesday-through-Friday periods that would be included in a strictly sequential calculation.
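Here is a toy illustration of the like-period trailing average with made-up numbers (the real work is done in calc_trailing below): each row is averaged with the three most recent rows that share its freq2_per label, shifted by one so the current period is excluded.
# Toy like-period trailing average: rolling(3).mean() within each
# intra-week period, shifted to exclude the current period.
toy = pd.DataFrame({'freq2_per': [0, 1, 0, 1, 0, 1, 0, 1],
                    'sales':     [4, 2, 6, 1, 5, 3, 7, 2]})
toy['trailing_avg'] = (toy.groupby('freq2_per')['sales']
                          .transform(lambda s: s.rolling(3).mean().shift(1)))
toy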
# This function calculates the trailing average sales for a given order frequency.
def calc_trailing(orders_per_week):
    # Check that orders per week is in the available range.
    if orders_per_week not in [2, 3]:
        raise ValueError('Orders per week must be either 2 or 3.')
    freq_per = 'freq' + str(orders_per_week) + '_per'
    freq_end = 'freq' + str(orders_per_week) + '_end'
    freq_end_avg = freq_end + '_avg'
    f = {'sales': 'sum', freq_per: 'mean'}
    g = ['store', 'item', freq_end]
    # Here we filter the data frame for each of the intra-week periods in
    # the specified order frequency and perform the trailing average
    # calculations on them separately.
    df_final = pd.DataFrame()
    for n in df_f[freq_per].value_counts().index:
        df_model = df_f[df_f[freq_per] == n].groupby(g).agg(f).fillna(0)
        # Three-period trailing average, shifted so the current period
        # is excluded.
        rolling_avg = (df_model
                       .apply(lambda x: x.rolling(window=3).mean())
                       .shift(1)
                       )
        df_model[freq_end_avg] = rolling_avg['sales']
        df_final = df_final.append(df_model.reset_index())
    return df_final.groupby(g).sum()
df_final = calc_trailing(3)
df_final.head(20)
| store    | item  | freq3_end  | sales | freq3_per | freq3_end_avg |
|----------|-------|------------|-------|-----------|---------------|
| SEGWD103 | 41774 | 2016-02-16 | 0.0   | 0         | NaN           |
|          |       | 2016-02-18 | 2.0   | 1         | NaN           |
|          |       | 2016-02-20 | 1.0   | 2         | NaN           |
|          |       | 2016-02-23 | 0.0   | 0         | NaN           |
|          |       | 2016-02-25 | 0.0   | 1         | NaN           |
|          |       | 2016-02-27 | 2.0   | 2         | NaN           |
|          |       | 2016-03-01 | 1.0   | 0         | NaN           |
|          |       | 2016-03-03 | 0.0   | 1         | NaN           |
|          |       | 2016-03-05 | 2.0   | 2         | NaN           |
|          |       | 2016-03-08 | 2.0   | 0         | 0.333333      |
|          |       | 2016-03-10 | 2.0   | 1         | 0.666667      |
|          |       | 2016-03-12 | 1.0   | 2         | 1.666667      |
|          |       | 2016-03-15 | 2.0   | 0         | 1.000000      |
|          |       | 2016-03-17 | 1.0   | 1         | 0.666667      |
|          |       | 2016-03-19 | 2.0   | 2         | 1.666667      |
|          |       | 2016-03-22 | 3.0   | 0         | 1.666667      |
|          |       | 2016-03-24 | 0.0   | 1         | 1.000000      |
|          |       | 2016-03-26 | 2.0   | 2         | 1.666667      |
|          |       | 2016-03-29 | 4.0   | 0         | 2.333333      |
|          |       | 2016-03-31 | 0.0   | 1         | 1.000000      |
# Remove leading zero-sales records (before an item's first sale at a store)
# and records that don't yet have a trailing average.
def remove_missing(df_final):
    beg_len = len(df_final)
    df_final['cum_sales'] = df_final.groupby(level=[0, 1]).cumsum()['sales']
    df_final_masked = df_final[df_final.cum_sales != 0].dropna(how='any')
    print('{} records removed'.format(beg_len - len(df_final_masked)))
    print('{} records remaining'.format(len(df_final_masked)))
    return df_final_masked.reset_index()
df_final_masked = remove_missing(df_final)
store = 'SEGWD104'
item = 41783
df_final_masked.query('(store == @store) & (item == @item)').head()
487591 records removed
479492 records remaining
|      | store    | item  | freq3_end  | sales | freq3_per | freq3_end_avg | cum_sales |
|------|----------|-------|------------|-------|-----------|---------------|-----------|
| 4501 | SEGWD104 | 41783 | 2016-08-30 | 1.0   | 0         | 0.000000      | 1.0       |
| 4502 | SEGWD104 | 41783 | 2016-09-01 | 2.0   | 1         | 0.000000      | 3.0       |
| 4503 | SEGWD104 | 41783 | 2016-09-03 | 1.0   | 2         | 0.000000      | 4.0       |
| 4504 | SEGWD104 | 41783 | 2016-09-06 | 5.0   | 0         | 0.333333      | 9.0       |
| 4505 | SEGWD104 | 41783 | 2016-09-08 | 0.0   | 1         | 0.666667      | 9.0       |
# We have some extra logic here to deal with needing to create dummy variables
# for the three orders per week period variable, but not the two, which is
# already binary.
def encode_cat_vars(df_final_masked):
    # Make sure sales is the first column.
    cols = df_final_masked.columns.tolist()
    cols.remove('sales')
    cols = ['sales'] + cols
    df_final_masked = df_final_masked[cols]
    # Initial list of columns to drop: the cumulative sales helper plus one
    # dummy level each for item and store, to avoid perfect collinearity
    # with the constant term.
    drop_cols = ['cum_sales', 41793, 'SEGWD103']
    # Initial dummy variables.
    stores = pd.get_dummies(df_final_masked['store'])
    items = pd.get_dummies(df_final_masked['item'])
    concat_tables = [df_final_masked, stores, items]
    # Create dummy variables for freq3_per if necessary (freq2_per is
    # already binary, so it can be used as-is).
    try:
        freq = cols[cols.index('freq2_end')][:5]
    except ValueError:
        freq = 'freq3'
        freq3_d = pd.get_dummies(df_final_masked['freq3_per'], prefix='freq3_per')
        concat_tables.append(freq3_d)
        drop_cols.append(freq + '_per_2')
    # Add dummy variables.
    df_final_masked = pd.concat(concat_tables, axis=1)
    # Drop columns and return, sorted by period end date.
    return df_final_masked.drop(drop_cols, axis=1).sort_values(freq + '_end')
data = encode_cat_vars(df_final_masked)
data.head()
|        | sales | store     | item  | freq3_end  | freq3_per | freq3_end_avg | SEGWD104 | SEGWD116 | SEGWD12 | SEGWD123 | ... | 42045 | 42046 | 42047 | 42048 | 42049 | 42050 | 42051 | 42052 | freq3_per_0 | freq3_per_1 |
|--------|-------|-----------|-------|------------|-----------|---------------|----------|----------|---------|----------|-----|-------|-------|-------|-------|-------|-------|-------|-------|-------------|-------------|
| 410400 | 2.0   | SEGWD671  | 41797 | 2016-02-16 | 0         | 3.000000      | 0        | 0        | 0       | 0        | ... | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1           | 0           |
| 49344  | 2.0   | SEGWD151  | 41791 | 2016-02-16 | 0         | 1.000000      | 0        | 0        | 0       | 0        | ... | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1           | 0           |
| 86661  | 2.0   | SEGWD1690 | 41793 | 2016-02-16 | 0         | 2.333333      | 0        | 0        | 0       | 0        | ... | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1           | 0           |
| 393506 | 1.0   | SEGWD649  | 41795 | 2016-02-16 | 0         | 0.333333      | 0        | 0        | 0       | 0        | ... | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1           | 0           |
| 366425 | 3.0   | SEGWD54   | 41797 | 2016-02-16 | 0         | 2.000000      | 0        | 0        | 0       | 0        | ... | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1           | 0           |
5 rows × 226 columns
This time we'll train the model with data through the end of October and then use the remaining data to test the model. This is where we also do the final trimming of the data to ensure it includes only whole intra-week periods.
# Values are beginning of training, end of training, and end of testing,
# and represent the end of the respective intra-week order period.
date_range = dict(
    freq2_end=['2016-03-01', '2016-10-28', '2016-12-23'],
    freq3_end=['2016-03-03', '2016-10-29', '2016-12-24']
)
# Pull the period-average and period-end column names by position
# (see the column order established by encode_cat_vars above).
freq_avg = data.columns.tolist()[5]
freq = data.columns.tolist()[3]
beg_train, end_train, end_test = date_range[freq]
# Add constant
data['const'] = 1
# Normalize average sales variable.
mu = data[freq_avg].mean()
std = data[freq_avg].std()
data[freq_avg] = (data[freq_avg] - mu) / std
data_train = data[data[freq].ge(beg_train) & data[freq].le(end_train)]
data_test = data[data[freq].gt(end_train) & data[freq].le(end_test)]
# Column 0 is the target; everything from column 5 on (the normalized
# trailing average, the dummies, and the constant) is a feature.
X_train = data_train.iloc[:, 5:].as_matrix()
y_train = data_train.iloc[:, 0].as_matrix()
X_test = data_test.iloc[:, 5:].as_matrix()
y_test = data_test.iloc[:, 0].as_matrix()
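Before training, a quick shape check confirms the feature matrices and target vectors line up (illustrative only):
# Confirm the train/test splits line up.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)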
This is where we actually train the model. I ran it for 100 iterations - more iterations likely won't increase the predictive power of the model, but there are some other diagnostics we can run to see what other improvements we can make.
from sklearn import linear_model
clf = linear_model.SGDRegressor(n_iter=100)
clf.fit(X_train, y_train)
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=100, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)
predict = clf.predict(X_test)
# Track where the model predicts negative sales, which we would floor at
# zero in practice.
predict_neg = predict < 0
error = predict - y_test
# Total error attributable to the negative predictions.
error_neg = predict_neg @ error
# Recover the model parameters by predicting on an identity matrix: each
# prediction is that feature's coefficient plus the intercept.
np.savetxt('modelparams.csv', clf.predict(np.eye(X_test.shape[1])), delimiter=",")
print('R-squared: {:.{p}f}'.format(clf.score(X_test, y_test), p=4))
print('Total error in sales quantity: {:.{p}f}'.format(sum(error), p=0))
print('Total error as a % of actual: {:.{p}f}%'.format(sum(error) / sum(y_test)*100, p=2))
print('Total error in sales quantity with zero min prediction: {:.{p}f}'.format(sum(error)-error_neg, p=0))
print('Total error as a % of actual with zero min prediction: {:.{p}f}%'.format((sum(error)-error_neg) / sum(y_test)*100, p=2))
R-squared: 0.5402
Total error in sales quantity: 21932
Total error as a % of actual: 12.81%
Total error in sales quantity with zero min prediction: 23599
Total error as a % of actual with zero min prediction: 13.79%
This is still quite a good fit overall. As you would expect, it is not quite as accurate as the model that was based on seven-day periods.
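For reference, aggregate residual metrics are cheap to compute from the arrays we already have; a quick sketch (these were not part of the original diagnostics):
# Supplementary residual metrics from the test-set errors above.
mae = np.mean(np.abs(error))
rmse = np.sqrt(np.mean(error ** 2))
print('MAE: {:.4f}, RMSE: {:.4f}'.format(mae, rmse))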
# Reassemble a prediction frame on the test set (copy to avoid mutating
# a view of data_test).
data_predict = data_test.iloc[:, :5].copy()
# Un-normalize the average sales column for readability.
data_predict[freq_avg] = data[freq_avg] * std + mu
data_predict['predict'] = predict
data_predict['variance'] = predict - data_predict['sales']
data_predict.head()
|        | sales | store     | item  | freq3_end  | freq3_per | freq3_end_avg | predict  | variance  |
|--------|-------|-----------|-------|------------|-----------|---------------|----------|-----------|
| 73276  | 1.0   | SEGWD168  | 41780 | 2016-11-01 | 0         | 0.333333      | 0.645468 | -0.354532 |
| 285476 | 0.0   | SEGWD2366 | 42043 | 2016-11-01 | 0         | 1.000000      | 1.174602 | 1.174602  |
| 407040 | 0.0   | SEGWD67   | 42040 | 2016-11-01 | 0         | 0.000000      | 0.247199 | 0.247199  |
| 222097 | 6.0   | SEGWD2261 | 41795 | 2016-11-01 | 0         | 4.000000      | 4.417526 | -1.582474 |
| 386161 | 0.0   | SEGWD622  | 42045 | 2016-11-01 | 0         | 0.000000      | 0.149843 | 0.149843  |
# cufflinks' iplot(asFigure=True) returns a figure object that we can hand
# back to plotly for display.
py.iplot(data_predict.groupby(freq).sum()[['sales', 'predict']]
         .iplot(asFigure=True, kind='bar', xTitle='Dates', yTitle='Sales',
                title='Actual vs. Predicted'))
# Pivot actual, predicted, and variance by store-item and period end.
pivot = (pd.pivot_table(data_predict, index=['store', 'item'], columns=[freq],
                        values=['sales', 'predict', 'variance'],
                        aggfunc=np.sum, margins=True, margins_name='Total')
         .swaplevel(axis=1)
         .sortlevel(0, axis=1, sort_remaining=False)
         )
pivot
freq3_end | 2016-11-01 00:00:00 | 2016-11-03 00:00:00 | 2016-11-05 00:00:00 | 2016-11-08 00:00:00 | ... | 2016-12-20 00:00:00 | 2016-12-22 00:00:00 | 2016-12-24 00:00:00 | Total | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
sales | predict | variance | sales | predict | variance | sales | predict | variance | sales | ... | variance | sales | predict | variance | sales | predict | variance | sales | predict | variance | ||
store | item | |||||||||||||||||||||
SEGWD103 | 41774 | 3.0 | 1.496760 | -1.503240 | 3.0 | 1.632384 | -1.367616 | 0.0 | 1.656427 | 1.656427 | 3.0 | ... | 0.698157 | 1.0 | 1.028195 | 0.028195 | 0.0 | 1.253634 | 1.253634 | 30.0 | 37.881773 | 7.881773 |
41775 | 1.0 | 1.370742 | 0.370742 | 1.0 | 1.506365 | 0.506365 | 2.0 | 0.926219 | -1.073781 | 2.0 | ... | 0.370742 | 0.0 | 0.700780 | 0.700780 | 0.0 | 2.134597 | 2.134597 | 28.0 | 32.843361 | 4.843361 | |
41776 | 0.0 | 0.820901 | 0.820901 | 0.0 | 0.755128 | 0.755128 | 0.0 | 0.376379 | 0.376379 | 0.0 | ... | 0.820901 | 1.0 | 0.352336 | -0.647664 | 0.0 | 0.376379 | 0.376379 | 6.0 | 14.612282 | 8.612282 | |
41777 | 3.0 | 1.491449 | -1.508551 | 0.0 | 0.821487 | 0.821487 | 4.0 | 1.248323 | -2.751677 | 1.0 | ... | 0.894242 | 1.0 | 0.821487 | -0.178513 | 0.0 | 0.845530 | 0.845530 | 28.0 | 29.899845 | 1.899845 | |
41778 | 1.0 | 1.472789 | 0.472789 | 1.0 | 0.601431 | -0.398569 | 1.0 | 0.826870 | -0.173130 | 1.0 | ... | -1.527211 | 1.0 | 1.004224 | 0.004224 | 0.0 | 0.826870 | 0.826870 | 27.0 | 27.236655 | 0.236655 | |
41779 | 1.0 | 1.152855 | 0.152855 | 1.0 | 1.087082 | 0.087082 | 0.0 | 1.715314 | 1.715314 | 2.0 | ... | -1.041560 | 2.0 | 1.489874 | -0.510126 | 0.0 | 1.111125 | 1.111125 | 35.0 | 34.260152 | -0.739848 | |
41780 | 3.0 | 1.547540 | -1.452460 | 2.0 | 0.877578 | -1.122422 | 1.0 | 1.304414 | 0.304414 | 1.0 | ... | 1.353125 | 1.0 | 0.676182 | -0.323818 | 2.0 | 1.304414 | -0.695586 | 35.0 | 32.857197 | -2.142803 | |
41781 | 0.0 | 0.253061 | 0.253061 | 0.0 | -0.014109 | -0.014109 | 0.0 | 0.009934 | 0.009934 | 0.0 | ... | 0.253061 | 0.0 | -0.014109 | -0.014109 | 0.0 | 0.009934 | 0.009934 | 0.0 | 1.991090 | 1.991090 | |
41782 | 1.0 | 2.069870 | 1.069870 | 1.0 | 1.399908 | 0.399908 | 1.0 | 1.625347 | 0.625347 | 4.0 | ... | -0.735715 | 0.0 | 1.198512 | 1.198512 | 3.0 | 1.625347 | -1.374653 | 35.0 | 38.747044 | 3.747044 | |
41783 | 3.0 | 1.997093 | -1.002907 | 2.0 | 1.125735 | -0.874265 | 3.0 | 1.351174 | -1.648826 | 3.0 | ... | -0.002907 | 0.0 | 1.528527 | 1.528527 | 0.0 | 1.753967 | 1.753967 | 45.0 | 42.035298 | -2.964702 | |
41786 | 0.0 | 0.316015 | 0.316015 | 0.0 | 0.048845 | 0.048845 | 0.0 | 0.072888 | 0.072888 | 0.0 | ... | 0.316015 | 0.0 | 0.048845 | 0.048845 | 0.0 | 0.072888 | 0.072888 | 0.0 | 3.501988 | 3.501988 | |
41788 | 3.0 | 1.492568 | -1.507432 | 1.0 | 1.829588 | 0.829588 | 0.0 | 0.846649 | 0.846649 | 4.0 | ... | -1.104639 | 0.0 | 1.628191 | 1.628191 | 1.0 | 0.645253 | -0.354747 | 33.0 | 36.975574 | 3.975574 | |
41789 | 2.0 | 1.118977 | -0.881023 | 0.0 | 0.851807 | 0.851807 | 1.0 | 1.278643 | 0.278643 | 3.0 | ... | -0.276835 | 0.0 | 1.053203 | 1.053203 | 1.0 | 0.875850 | -0.124150 | 29.0 | 30.023336 | 1.023336 | |
41790 | 2.0 | 0.999797 | -1.000203 | 0.0 | 1.336817 | 1.336817 | 1.0 | 0.756671 | -0.243329 | 2.0 | ... | 1.201194 | 1.0 | 0.934024 | -0.065976 | 0.0 | 0.555275 | 0.555275 | 21.0 | 24.947678 | 3.947678 | |
41791 | 3.0 | 3.222639 | 0.222639 | 5.0 | 2.552677 | -2.447323 | 1.0 | 2.576720 | 1.576720 | 2.0 | ... | 1.410073 | 1.0 | 1.545696 | 0.545696 | 0.0 | 1.368343 | 1.368343 | 28.0 | 54.933915 | 26.933915 | |
41792 | 10.0 | 4.952470 | -5.047530 | 3.0 | 3.074130 | 0.074130 | 2.0 | 2.493984 | 0.493984 | 5.0 | ... | 1.938507 | 0.0 | 1.462959 | 1.462959 | 0.0 | 1.889795 | 1.889795 | 37.0 | 69.664135 | 32.664135 | |
41793 | 10.0 | 4.662246 | -5.337754 | 3.0 | 3.992284 | 0.992284 | 5.0 | 3.814931 | -1.185069 | 10.0 | ... | -1.136358 | 5.0 | 3.186699 | -1.813301 | 3.0 | 3.412138 | 0.412138 | 113.0 | 104.991987 | -8.008013 | |
41794 | 6.0 | 3.695985 | -2.304015 | 4.0 | 2.824627 | -1.175373 | 5.0 | 2.244481 | -2.755519 | 6.0 | ... | -1.699826 | 3.0 | 4.435797 | 1.435797 | 3.0 | 2.043085 | -0.956915 | 94.0 | 86.031049 | -7.968951 | |
41795 | 7.0 | 4.138919 | -2.861081 | 3.0 | 3.267561 | 0.267561 | 9.0 | 3.895793 | -5.104207 | 6.0 | ... | -2.249910 | 2.0 | 4.274543 | 2.274543 | 2.0 | 3.895793 | 1.895793 | 115.0 | 108.543855 | -6.456145 | |
41796 | 0.0 | 2.333422 | 2.333422 | 4.0 | 2.267649 | -1.732351 | 5.0 | 2.090296 | -2.909704 | 6.0 | ... | -1.256804 | 1.0 | 2.871838 | 1.871838 | 3.0 | 1.687503 | -1.312497 | 71.0 | 57.558860 | -13.441140 | |
41797 | 0.0 | 2.085702 | 2.085702 | 0.0 | 2.019929 | 2.019929 | 5.0 | 2.043972 | -2.956028 | 1.0 | ... | -1.907316 | 0.0 | 2.019929 | 2.019929 | 3.0 | 1.439783 | -1.560217 | 46.0 | 45.168899 | -0.831101 | |
41798 | 3.0 | 3.315367 | 0.315367 | 2.0 | 2.041216 | 0.041216 | 5.0 | 1.863863 | -3.136137 | 3.0 | ... | 0.711178 | 4.0 | 2.444008 | -1.555992 | 3.0 | 2.065259 | -0.934741 | 70.0 | 59.576130 | -10.423870 | |
41799 | 2.0 | 1.360804 | -0.639196 | 3.0 | 0.690842 | -2.309158 | 1.0 | 0.714885 | -0.285115 | 1.0 | ... | -0.437799 | 1.0 | 0.892239 | -0.107761 | 1.0 | 1.319074 | 0.319074 | 33.0 | 28.576936 | -4.423064 | |
41800 | 0.0 | 0.170117 | 0.170117 | 0.0 | -0.097052 | -0.097052 | 0.0 | -0.073009 | -0.073009 | 0.0 | ... | 0.170117 | 0.0 | -0.097052 | -0.097052 | 0.0 | -0.073009 | -0.073009 | 0.0 | 0.000452 | 0.000452 | |
41803 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
41804 | 0.0 | 0.110489 | 0.110489 | 0.0 | -0.156681 | -0.156681 | 0.0 | -0.132638 | -0.132638 | 0.0 | ... | 0.110489 | 0.0 | -0.156681 | -0.156681 | 0.0 | -0.132638 | -0.132638 | 0.0 | -1.430642 | -1.430642 | |
42040 | 0.0 | 0.776137 | 0.776137 | 0.0 | 0.508968 | 0.508968 | 0.0 | 0.130218 | 0.130218 | 0.0 | ... | 0.373345 | 0.0 | 0.106175 | 0.106175 | 0.0 | 0.130218 | 0.130218 | 0.0 | 6.489080 | 6.489080 | |
42041 | 0.0 | 0.817053 | 0.817053 | 0.0 | 0.751280 | 0.751280 | 0.0 | 0.372530 | 0.372530 | 0.0 | ... | 0.414260 | 0.0 | 0.147091 | 0.147091 | 0.0 | 0.171134 | 0.171134 | 0.0 | 8.478029 | 8.478029 | |
42042 | 0.0 | 0.828125 | 0.828125 | 0.0 | 0.359559 | 0.359559 | 0.0 | 0.584999 | 0.584999 | 0.0 | ... | 0.425332 | 0.0 | 0.158163 | 0.158163 | 0.0 | 0.182206 | 0.182206 | 0.0 | 8.139574 | 8.139574 | |
42043 | 0.0 | 0.740928 | 0.740928 | 0.0 | 0.272362 | 0.272362 | 0.0 | 0.296405 | 0.296405 | 0.0 | ... | 0.338135 | 0.0 | 0.070966 | 0.070966 | 0.0 | 0.095009 | 0.095009 | 0.0 | 5.845440 | 5.845440 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
SEGWD97 | 41786 | 0.0 | 0.108289 | 0.108289 | 0.0 | -0.158880 | -0.158880 | 0.0 | -0.134837 | -0.134837 | 0.0 | ... | 0.108289 | 0.0 | -0.158880 | -0.158880 | 0.0 | -0.134837 | -0.134837 | 0.0 | -1.483428 | -1.483428 |
41788 | 2.0 | 1.083446 | -0.916554 | 2.0 | 1.017673 | -0.982327 | 1.0 | 1.243112 | 0.243112 | 1.0 | ... | -0.312365 | 1.0 | 0.816277 | -0.183723 | 1.0 | 0.638924 | -0.361076 | 27.0 | 28.163628 | 1.163628 | |
41789 | 2.0 | 0.508458 | -1.491542 | 0.0 | 1.852459 | 1.852459 | 3.0 | 1.473710 | -1.526290 | 0.0 | ... | -0.484560 | 1.0 | 0.845478 | -0.154522 | 1.0 | 0.265332 | -0.734668 | 25.0 | 23.426750 | -1.573250 | |
41790 | 1.0 | 0.792072 | -0.207928 | 0.0 | 0.524902 | 0.524902 | 4.0 | 0.548945 | -3.451055 | 0.0 | ... | 0.590675 | 1.0 | 0.323506 | -0.676494 | 1.0 | 0.548945 | -0.451055 | 16.0 | 15.732939 | -0.267061 | |
41791 | 4.0 | 2.612121 | -1.387879 | 5.0 | 2.546348 | -2.453652 | 0.0 | 2.167598 | 2.167598 | 7.0 | ... | -1.186483 | 2.0 | 2.344952 | 0.344952 | 3.0 | 2.973184 | -0.026816 | 70.0 | 66.060203 | -3.939797 | |
41792 | 3.0 | 2.730781 | -0.269219 | 4.0 | 2.463612 | -1.536388 | 4.0 | 3.494636 | -0.505364 | 3.0 | ... | -1.665030 | 3.0 | 3.067800 | 0.067800 | 3.0 | 2.286258 | -0.713742 | 68.0 | 72.533175 | 4.533175 | |
41793 | 4.0 | 2.037765 | -1.962235 | 2.0 | 2.576180 | 0.576180 | 4.0 | 2.801620 | -1.198380 | 2.0 | ... | -0.760839 | 3.0 | 3.180369 | 0.180369 | 2.0 | 2.398827 | 0.398827 | 61.0 | 66.373388 | 5.373388 | |
41794 | 4.0 | 2.682674 | -1.317326 | 3.0 | 2.415505 | -0.584495 | 5.0 | 2.036755 | -2.963245 | 5.0 | ... | -2.122911 | 2.0 | 3.422486 | 1.422486 | 3.0 | 3.043737 | 0.043737 | 75.0 | 69.767440 | -5.232560 | |
41795 | 3.0 | 3.931194 | 0.931194 | 3.0 | 2.657043 | -0.342957 | 3.0 | 2.681086 | -0.318914 | 4.0 | ... | -0.471599 | 3.0 | 4.066817 | 1.066817 | 2.0 | 3.083878 | 1.083878 | 77.0 | 83.016016 | 6.016016 | |
41796 | 2.0 | 1.722904 | -0.277096 | 1.0 | 1.052942 | 0.052942 | 4.0 | 1.076985 | -2.923015 | 2.0 | ... | -0.478492 | 2.0 | 0.650149 | -1.349851 | 0.0 | 1.681174 | 1.681174 | 34.0 | 33.843588 | -0.156412 | |
41797 | 2.0 | 1.676580 | -0.323420 | 2.0 | 1.208014 | -0.791986 | 4.0 | 0.829265 | -3.170735 | 3.0 | ... | -0.524816 | 0.0 | 0.603825 | 0.603825 | 2.0 | 1.030661 | -0.969339 | 35.0 | 33.738801 | -1.261199 | |
41798 | 0.0 | 1.295074 | 1.295074 | 1.0 | 1.430698 | 0.430698 | 1.0 | 1.051948 | 0.051948 | 3.0 | ... | -0.906322 | 1.0 | 0.423716 | -0.576284 | 2.0 | 1.454741 | -0.545259 | 28.0 | 30.020364 | 2.020364 | |
41799 | 0.0 | 0.347493 | 0.347493 | 0.0 | 0.080324 | 0.080324 | 0.0 | 0.104367 | 0.104367 | 0.0 | ... | 0.750286 | 0.0 | 0.080324 | 0.080324 | 0.0 | 0.305763 | 0.305763 | 4.0 | 5.868645 | 1.868645 | |
41800 | 0.0 | -0.037608 | -0.037608 | 0.0 | -0.304778 | -0.304778 | 0.0 | -0.280735 | -0.280735 | 0.0 | ... | -0.037608 | 0.0 | -0.304778 | -0.304778 | 0.0 | -0.280735 | -0.280735 | 0.0 | -4.984964 | -4.984964 | |
41803 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
41804 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
42040 | 0.0 | 0.165619 | 0.165619 | 0.0 | -0.101550 | -0.101550 | 0.0 | 0.123889 | 0.123889 | 0.0 | ... | 0.165619 | 0.0 | -0.101550 | -0.101550 | 0.0 | -0.077507 | -0.077507 | 0.0 | 0.093890 | 0.093890 | |
42041 | 0.0 | 0.206535 | 0.206535 | 0.0 | -0.060635 | -0.060635 | 0.0 | -0.036592 | -0.036592 | 0.0 | ... | 0.206535 | 0.0 | -0.060635 | -0.060635 | 0.0 | -0.036592 | -0.036592 | 0.0 | 0.874461 | 0.874461 | |
42042 | 0.0 | 0.217607 | 0.217607 | 0.0 | -0.049563 | -0.049563 | 0.0 | 0.175877 | 0.175877 | 0.0 | ... | 0.217607 | 0.0 | -0.049563 | -0.049563 | 0.0 | -0.025520 | -0.025520 | 0.0 | 1.341591 | 1.341591 | |
42043 | 0.0 | 0.331806 | 0.331806 | 0.0 | -0.136760 | -0.136760 | 0.0 | -0.112717 | -0.112717 | 0.0 | ... | 0.130409 | 0.0 | -0.136760 | -0.136760 | 0.0 | -0.112717 | -0.112717 | 0.0 | -0.549751 | -0.549751 | |
42044 | 0.0 | 0.529452 | 0.529452 | 0.0 | -0.140510 | -0.140510 | 0.0 | -0.116467 | -0.116467 | 0.0 | ... | 0.126660 | 0.0 | -0.140510 | -0.140510 | 0.0 | -0.116467 | -0.116467 | 0.0 | -0.236952 | -0.236952 | |
42045 | 0.0 | 0.153535 | 0.153535 | 0.0 | -0.113635 | -0.113635 | 0.0 | 0.111804 | 0.111804 | 0.0 | ... | 0.153535 | 0.0 | -0.113635 | -0.113635 | 0.0 | -0.089592 | -0.089592 | 0.0 | -0.196142 | -0.196142 | |
42046 | 0.0 | 0.073952 | 0.073952 | 0.0 | -0.193217 | -0.193217 | 0.0 | -0.169174 | -0.169174 | 0.0 | ... | 0.073952 | 0.0 | -0.193217 | -0.193217 | 0.0 | -0.169174 | -0.169174 | 0.0 | -2.307511 | -2.307511 | |
42047 | 1.0 | 1.247721 | 0.247721 | 0.0 | 0.577759 | 0.577759 | 5.0 | 0.601802 | -4.398198 | 1.0 | ... | -0.752279 | 0.0 | 0.577759 | 0.577759 | 1.0 | 1.407388 | 0.407388 | 35.0 | 27.071326 | -7.928674 | |
42048 | 1.0 | 1.257838 | 0.257838 | 1.0 | 0.386480 | -0.613520 | 2.0 | 1.216108 | -0.783892 | 2.0 | ... | 0.063423 | 2.0 | 0.587876 | -1.412124 | 1.0 | 1.014712 | 0.014712 | 38.0 | 28.321106 | -9.678894 | |
42049 | 2.0 | 1.131052 | -0.868948 | 1.0 | 0.259693 | -0.740307 | 4.0 | 1.290718 | -2.709282 | 1.0 | ... | 0.131052 | 2.0 | 1.468071 | -0.531929 | 2.0 | 0.485133 | -1.514867 | 42.0 | 28.500577 | -13.499423 | |
42050 | 2.0 | 1.472176 | -0.527824 | 0.0 | 0.399422 | 0.399422 | 2.0 | 0.826257 | -1.173743 | 4.0 | ... | 0.069384 | 2.0 | 0.802214 | -1.197786 | 2.0 | 1.027653 | -0.972347 | 31.0 | 24.603784 | -6.396216 | |
42051 | 0.0 | 1.289408 | 1.289408 | 1.0 | 1.022239 | 0.022239 | 3.0 | 1.046282 | -1.953718 | 1.0 | ... | 0.094993 | 1.0 | 0.619446 | -0.380554 | 1.0 | 0.643489 | -0.356511 | 29.0 | 25.655051 | -3.344949 | |
42052 | 2.0 | 1.079798 | -0.920202 | 1.0 | 0.409836 | -0.590164 | 4.0 | 0.635276 | -3.364724 | 1.0 | ... | 1.281195 | 0.0 | 0.208440 | 0.208440 | 4.0 | 0.232483 | -3.767517 | 29.0 | 19.416036 | -9.583964 | |
Total | 8733.0 | 9304.347387 | 571.347387 | 6118.0 | 6650.350036 | 532.350036 | 10153.0 | 7259.384284 | -2893.615716 | 9235.0 | ... | 1261.308616 | 5625.0 | 6810.836373 | 1185.836373 | 6752.0 | 7032.609569 | 280.609569 | 171148.0 | 193080.171850 | 21932.171850 |
7060 rows × 75 columns
timestring = datetime.datetime.fromtimestamp(time.time()).strftime('%m%d%H%M')
pivot.to_csv('pivot_' + freq[:5] + '_' + timestring + '.csv')