from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
from plotly import tools
df = pd.read_csv('sales.csv')
df.OrderDate = pd.to_datetime(df.OrderDate)
df.columns = ['date', 'store', 'item', 'sales']
df.date = pd.to_datetime(df.date)
df.item = df.item.astype('category')
df.store = df.store.astype('category')
df.dtypes
date datetime64[ns] store category item category sales int64 dtype: object
Below are some summary statistics on the data. Overall the quantities for individual items at individual stores is quite small. It would be difficult to forecast daily quantities of individual items and individual stores, so we will work towards forecasting weekly item sales at individual stores.
df.describe(include='all')
date | store | item | sales | |
---|---|---|---|---|
count | 350796 | 350796 | 350796.0 | 350796.000000 |
unique | 301 | 181 | 39.0 | NaN |
top | 2016-11-05 00:00:00 | SEGWD7 | 41795.0 | NaN |
freq | 4110 | 5353 | 22282.0 | NaN |
first | 2016-02-04 00:00:00 | NaN | NaN | NaN |
last | 2016-11-30 00:00:00 | NaN | NaN | NaN |
mean | NaN | NaN | NaN | 1.681211 |
std | NaN | NaN | NaN | 1.134632 |
min | NaN | NaN | NaN | 1.000000 |
25% | NaN | NaN | NaN | 1.000000 |
50% | NaN | NaN | NaN | 1.000000 |
75% | NaN | NaN | NaN | 2.000000 |
max | NaN | NaN | NaN | 112.000000 |
Let's take a look at a few cuts on the data to see if we can spot any trends. Below are plots of a handful of individual stores' sales. It looks like the answer to the question about the big jump in sales in September is the addition of a good number of stores.
Here is the total sales graph we took a look at yesterday.
df_total = df.groupby(pd.Grouper(freq='W', key='date')).sum().fillna(0).unstack('date', 0)
df_total.index.levels[1]
len(df_total) == len(df_total.index.levels[1])
trace = go.Scatter(
x = df_total.index.levels[1],
y = df_total
)
layout = go.Layout(
title='Total Sales'
)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='total-sales')
We may want to look at this data to figure out if we need to limit that date range to be from March to November in order to avoid any partial weeks. But that shouldn't be too much of an issue because I'm going to set the training data up to make predictions of sales of items at individual stores, so as long as the drop offs on either end are the result of fewer stores being included in the data, that shouldn't have a negative impact on the model.
df_1w = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)
rows = 10
cols = 3
spidx = np.arange(rows*cols).reshape(rows,cols)
fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])
for i in range(rows):
for j in range(cols):
trace = go.Scatter(c
x = df_1w.iloc[1].index.levels[1],
y = df_1w.iloc[spidx[i,j]],
)
fig.append_trace(trace, i+1, j+1)
fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (1,2) x2,y1 ] [ (1,3) x3,y1 ] [ (2,1) x4,y2 ] [ (2,2) x5,y2 ] [ (2,3) x6,y2 ] [ (3,1) x7,y3 ] [ (3,2) x8,y3 ] [ (3,3) x9,y3 ] [ (4,1) x10,y4 ] [ (4,2) x11,y4 ] [ (4,3) x12,y4 ] [ (5,1) x13,y5 ] [ (5,2) x14,y5 ] [ (5,3) x15,y5 ] [ (6,1) x16,y6 ] [ (6,2) x17,y6 ] [ (6,3) x18,y6 ] [ (7,1) x19,y7 ] [ (7,2) x20,y7 ] [ (7,3) x21,y7 ] [ (8,1) x22,y8 ] [ (8,2) x23,y8 ] [ (8,3) x24,y8 ] [ (9,1) x25,y9 ] [ (9,2) x26,y9 ] [ (9,3) x27,y9 ] [ (10,1) x28,y10 ] [ (10,2) x29,y10 ] [ (10,3) x30,y10 ]
So let's do a plot of the count of the number of stores with sales in each week. This confirms that there were a lot of new stores added in September.
store_sales = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date')
stores_with_sales = store_sales['sales'].where(store_sales.sales > 0).count()
stores_with_sales.index
trace = go.Bar(
x = stores_with_sales.index,
y = stores_with_sales
)
layout = go.Layout(
title='No. of Stores with Sales'
)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='stores-with-sales')
Let's take a look sales for the individual items.
df_1w = df.groupby(['item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)
rows = 13
cols = 3
fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])
spidx = np.arange(rows*cols).reshape(rows,cols)
for i in range(rows):
for j in range(cols):
trace = go.Scatter(
x = df_1w.iloc[1].index.levels[1],
y = df_1w.iloc[spidx[i,j]],
)
fig.append_trace(trace, i+1, j+1)
fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (1,2) x2,y1 ] [ (1,3) x3,y1 ] [ (2,1) x4,y2 ] [ (2,2) x5,y2 ] [ (2,3) x6,y2 ] [ (3,1) x7,y3 ] [ (3,2) x8,y3 ] [ (3,3) x9,y3 ] [ (4,1) x10,y4 ] [ (4,2) x11,y4 ] [ (4,3) x12,y4 ] [ (5,1) x13,y5 ] [ (5,2) x14,y5 ] [ (5,3) x15,y5 ] [ (6,1) x16,y6 ] [ (6,2) x17,y6 ] [ (6,3) x18,y6 ] [ (7,1) x19,y7 ] [ (7,2) x20,y7 ] [ (7,3) x21,y7 ] [ (8,1) x22,y8 ] [ (8,2) x23,y8 ] [ (8,3) x24,y8 ] [ (9,1) x25,y9 ] [ (9,2) x26,y9 ] [ (9,3) x27,y9 ] [ (10,1) x28,y10 ] [ (10,2) x29,y10 ] [ (10,3) x30,y10 ] [ (11,1) x31,y11 ] [ (11,2) x32,y11 ] [ (11,3) x33,y11 ] [ (12,1) x34,y12 ] [ (12,2) x35,y12 ] [ (12,3) x36,y12 ] [ (13,1) x37,y13 ] [ (13,2) x38,y13 ] [ (13,3) x39,y13 ]
It also looks like there were some big increases in sales of individual items. It would be interesting to do some more analysis to figure out if those items were in in the new stores that came on.
Now for the actual model. The first thing we'll do is add in a rolling average of the prior three weeks' sales.
df_model = df.groupby(['store', 'item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0)
rolling_sum = df_model.apply(lambda x:x.rolling(window=3).mean())
rolling_sum.shift(-1)
df_model['sales_avg'] = rolling_sum.shift(-1)['sales']
df_model.head(10)
sales | sales_avg | |||
---|---|---|---|---|
store | item | date | ||
SEGWD103 | 41774 | 2016-02-07 | 1.0 | NaN |
2016-02-14 | 3.0 | 2.333333 | ||
2016-02-21 | 3.0 | 2.666667 | ||
2016-02-28 | 2.0 | 2.666667 | ||
2016-03-06 | 3.0 | 3.333333 | ||
2016-03-13 | 5.0 | 4.666667 | ||
2016-03-20 | 6.0 | 5.666667 | ||
2016-03-27 | 6.0 | 5.333333 | ||
2016-04-03 | 4.0 | 5.333333 | ||
2016-04-10 | 6.0 | 5.000000 |
df_model.loc['SEGWD7'].to_csv('SEGWD7.csv')
df_model['cum_sales'] = df_model.groupby(level=[0,1]).cumsum()['sales']
df_model.reset_index(inplace=True)
df_model.describe(include='all')
store | item | date | sales | sales_avg | cum_sales | |
---|---|---|---|---|---|---|
count | 310596 | 310596.0 | 310596 | 310596.000000 | 310594.000000 | 310596.000000 |
unique | 181 | 39.0 | 44 | NaN | NaN | NaN |
top | SEGWD97 | 42052.0 | 2016-07-10 00:00:00 | NaN | NaN | NaN |
freq | 1716 | 7964.0 | 7059 | NaN | NaN | NaN |
first | NaN | NaN | 2016-02-07 00:00:00 | NaN | NaN | NaN |
last | NaN | NaN | 2016-12-04 00:00:00 | NaN | NaN | NaN |
mean | NaN | NaN | NaN | 1.898807 | 1.898811 | 36.002048 |
std | NaN | NaN | NaN | 3.853454 | 3.667099 | 88.267833 |
min | NaN | NaN | NaN | 0.000000 | 0.000000 | 0.000000 |
25% | NaN | NaN | NaN | 0.000000 | 0.000000 | 0.000000 |
50% | NaN | NaN | NaN | 0.000000 | 0.000000 | 0.000000 |
75% | NaN | NaN | NaN | 2.000000 | 2.666667 | 24.000000 |
max | NaN | NaN | NaN | 118.000000 | 86.000000 | 1977.000000 |
df_model_masked = df_model[df_model.cum_sales != 0]
df_model_masked.describe(include='all')
store | item | date | sales | sales_avg | cum_sales | |
---|---|---|---|---|---|---|
count | 144822 | 144822.0 | 144822 | 144822.000000 | 144820.000000 | 144822.000000 |
unique | 181 | 39.0 | 44 | NaN | NaN | NaN |
top | SEGWD7 | 41793.0 | 2016-11-27 00:00:00 | NaN | NaN | NaN |
freq | 1336 | 4759.0 | 6424 | NaN | NaN | NaN |
first | NaN | NaN | 2016-02-07 00:00:00 | NaN | NaN | NaN |
last | NaN | NaN | 2016-12-04 00:00:00 | NaN | NaN | NaN |
mean | NaN | NaN | NaN | 4.072323 | 4.030594 | 77.212661 |
std | NaN | NaN | NaN | 4.795340 | 4.501835 | 116.308495 |
min | NaN | NaN | NaN | 0.000000 | 0.000000 | 1.000000 |
25% | NaN | NaN | NaN | 0.000000 | 0.666667 | 9.000000 |
50% | NaN | NaN | NaN | 3.000000 | 2.666667 | 29.000000 |
75% | NaN | NaN | NaN | 6.000000 | 5.666667 | 98.000000 |
max | NaN | NaN | NaN | 118.000000 | 86.000000 | 1977.000000 |
It looks like that eliminated quite a few observations, over half of them. Now we can encode the store and item variables as binary classifications and then we can use the data to train a model.
stores = pd.get_dummies(df_model_masked['store'])
items = pd.get_dummies(df_model_masked['item'])
df_final = pd.concat([df_model_masked, stores, items], axis=1).dropna(how='any')
df_final.drop(['date', 'cum_sales', 'store', 'item', 41793, 'SEGWD103'], axis=1, inplace=True)
# df_final.to_csv('modeldata.csv')
df_final.head()
sales | sales_avg | SEGWD104 | SEGWD116 | SEGWD12 | SEGWD123 | SEGWD125 | SEGWD129 | SEGWD135 | SEGWD138 | ... | 42043 | 42044 | 42045 | 42046 | 42047 | 42048 | 42049 | 42050 | 42051 | 42052 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 3.0 | 2.333333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 3.0 | 2.666667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 2.0 | 2.666667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 3.0 | 3.333333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 5.0 | 4.666667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 220 columns
df_final.describe()
sales | sales_avg | SEGWD104 | SEGWD116 | SEGWD12 | SEGWD123 | SEGWD125 | SEGWD129 | SEGWD135 | SEGWD138 | ... | 42043 | 42044 | 42045 | 42046 | 42047 | 42048 | 42049 | 42050 | 42051 | 42052 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | ... | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 | 144820.000000 |
mean | 4.072373 | 4.030594 | 0.003349 | 0.003356 | 0.009094 | 0.009142 | 0.003370 | 0.009198 | 0.009177 | 0.009142 | ... | 0.016275 | 0.016013 | 0.016220 | 0.015385 | 0.016641 | 0.016545 | 0.016275 | 0.016655 | 0.016614 | 0.016344 |
std | 4.795355 | 4.501835 | 0.057774 | 0.057833 | 0.094928 | 0.095178 | 0.057951 | 0.095463 | 0.095356 | 0.095178 | ... | 0.126533 | 0.125526 | 0.126322 | 0.123077 | 0.127924 | 0.127558 | 0.126533 | 0.127976 | 0.127819 | 0.126797 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 0.666667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 3.000000 | 2.666667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 6.000000 | 5.666667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 118.000000 | 86.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 220 columns
data = df_final.dropna().as_matrix()
data.shape
(144820, 220)
We'll save 30% of the data to test our model.
np.random.shuffle(data)
split = int(0.70*data.shape[0])
X_train = np.ones(data[:split,:].shape)
X_test = np.ones(data[split:,:].shape)
X_train[:,1:] = data[:split,1:]
y_train = data[:split,0]
X_test[:,1:] = data[split:,1:]
y_test = data[split:,0]
This is where we actually train the model. I ran it for 200 iterations - more won't likely increase the predictive power of the model, but there are some other diagnostics we can run to see what other improvements we can make.
from sklearn import linear_model
clf = linear_model.SGDRegressor(n_iter=200)
clf.fit(X_train, y_train)
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling', loss='squared_loss', n_iter=200, penalty='l2', power_t=0.25, random_state=None, shuffle=True, verbose=0, warm_start=False)
predict = clf.predict(X_test)
predict_neg = predict < 0
error = y_test - predict
error_neg = predict_neg @ error
np.savetxt('modelparams.csv', clf.predict(np.eye(X_test.shape[1])), delimiter=",")
print('R-squared: {:.{p}f}'.format(clf.score(X_test, y_test), p=4))
print('Total error in sales quantity: {:.{p}f}'.format(sum(error), p=0))
print('Total error as a % of actual: {:.{p}f}%'.format(sum(error) / sum(y_test)*100, p=2))
print('Total error in sales quantity with zero min prediction: {:.{p}f}'.format(error_neg, p=0))
print('Total error as a % of actual with zero min prediction: {:.{p}f}%'.format((sum(error)+error_neg) / sum(y_test)*100, p=2))
R-squared: 0.8886 Total error in sales quantity: -7424 Total error as a % of actual: -4.20% Total error in sales quantity with zero min prediction: 14 Total error as a % of actual with zero min prediction: -4.19%
An overall r-square of 85% is pretty good, I think, for really just dealing with three variables - store and item numbers and historical sales - and the overall error is very low on such a large number of test observations.
predicted = go.Bar(
name = 'predicted',
y = clf.predict(X_test)
)
actual = go.Bar(
name = 'actual',
y = y_test
)
layout = go.Layout(
title='Actual vs. Predicted'
)
fig = go.Figure(data=[actual, predicted], layout=layout)
py.iplot(fig, filename='actual-vs-predicted')
Overall this says that this is a pretty good model. The total error over 40,000 individual store-item observations is really low. The graph above shows that we are missing on the big spikes, but overall the performance is good.
df_final.loc[:100].to_csv('samplemodeldata.csv')
np.eye(X_test.shape[1]) * clf.predict(X_test)