In [22]:

from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
from plotly import tools

Import and format data¶

In [23]:

df = pd.read_csv('sales.csv')
df.OrderDate = pd.to_datetime(df.OrderDate)
df.columns = ['date', 'store', 'item', 'sales']
df.date = pd.to_datetime(df.date)
df.item = df.item.astype('category')
df.store = df.store.astype('category')
df.dtypes

Out[23]:

date     datetime64[ns]
store          category
item           category
sales             int64
dtype: object

Below are some summary statistics on the data. Overall the quantities for individual items at individual stores is quite small. It would be difficult to forecast daily quantities of individual items and individual stores, so we will work towards forecasting weekly item sales at individual stores.

In [24]:

df.describe(include='all')

Out[24]:

	date	store	item	sales
count	350796	350796	350796.0	350796.000000
unique	301	181	39.0	NaN
top	2016-11-05 00:00:00	SEGWD7	41795.0	NaN
freq	4110	5353	22282.0	NaN
first	2016-02-04 00:00:00	NaN	NaN	NaN
last	2016-11-30 00:00:00	NaN	NaN	NaN
mean	NaN	NaN	NaN	1.681211
std	NaN	NaN	NaN	1.134632
min	NaN	NaN	NaN	1.000000
25%	NaN	NaN	NaN	1.000000
50%	NaN	NaN	NaN	1.000000
75%	NaN	NaN	NaN	2.000000
max	NaN	NaN	NaN	112.000000

Analyze Data¶

Let's take a look at a few cuts on the data to see if we can spot any trends. Below are plots of a handful of individual stores' sales. It looks like the answer to the question about the big jump in sales in September is the addition of a good number of stores.

Total Sales¶

Here is the total sales graph we took a look at yesterday.

In [25]:

df_total = df.groupby(pd.Grouper(freq='W', key='date')).sum().fillna(0).unstack('date', 0)
df_total.index.levels[1]

len(df_total) == len(df_total.index.levels[1])

trace = go.Scatter(
    x = df_total.index.levels[1],
    y = df_total
)

layout = go.Layout(
    title='Total Sales'
)


fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='total-sales')

We may want to look at this data to figure out if we need to limit that date range to be from March to November in order to avoid any partial weeks. But that shouldn't be too much of an issue because I'm going to set the training data up to make predictions of sales of items at individual stores, so as long as the drop offs on either end are the result of fewer stores being included in the data, that shouldn't have a negative impact on the model.

Individual Store Sales¶

In [26]:

df_1w = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)

rows = 10
cols = 3
spidx = np.arange(rows*cols).reshape(rows,cols)

fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])

for i in range(rows):

    for j in range(cols):

        trace = go.Scatter(c
            x = df_1w.iloc[1].index.levels[1],
            y = df_1w.iloc[spidx[i,j]],
        )

        fig.append_trace(trace, i+1, j+1)

fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y1 ]     [ (1,3) x3,y1 ]   
[ (2,1) x4,y2 ]     [ (2,2) x5,y2 ]     [ (2,3) x6,y2 ]   
[ (3,1) x7,y3 ]     [ (3,2) x8,y3 ]     [ (3,3) x9,y3 ]   
[ (4,1) x10,y4 ]    [ (4,2) x11,y4 ]    [ (4,3) x12,y4 ]  
[ (5,1) x13,y5 ]    [ (5,2) x14,y5 ]    [ (5,3) x15,y5 ]  
[ (6,1) x16,y6 ]    [ (6,2) x17,y6 ]    [ (6,3) x18,y6 ]  
[ (7,1) x19,y7 ]    [ (7,2) x20,y7 ]    [ (7,3) x21,y7 ]  
[ (8,1) x22,y8 ]    [ (8,2) x23,y8 ]    [ (8,3) x24,y8 ]  
[ (9,1) x25,y9 ]    [ (9,2) x26,y9 ]    [ (9,3) x27,y9 ]  
[ (10,1) x28,y10 ]  [ (10,2) x29,y10 ]  [ (10,3) x30,y10 ]

New Stores¶

So let's do a plot of the count of the number of stores with sales in each week. This confirms that there were a lot of new stores added in September.

In [27]:

store_sales = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date')
stores_with_sales = store_sales['sales'].where(store_sales.sales > 0).count()

stores_with_sales.index

trace = go.Bar(
    x = stores_with_sales.index,
    y = stores_with_sales
)

layout = go.Layout(
    title='No. of Stores with Sales'
)


fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='stores-with-sales')

Items¶

Let's take a look sales for the individual items.

In [28]:

df_1w = df.groupby(['item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)
rows = 13
cols = 3

fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])

spidx = np.arange(rows*cols).reshape(rows,cols)

for i in range(rows):

    for j in range(cols):

        trace = go.Scatter(
            x = df_1w.iloc[1].index.levels[1],
            y = df_1w.iloc[spidx[i,j]],
        )

        fig.append_trace(trace, i+1, j+1)

fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y1 ]     [ (1,3) x3,y1 ]   
[ (2,1) x4,y2 ]     [ (2,2) x5,y2 ]     [ (2,3) x6,y2 ]   
[ (3,1) x7,y3 ]     [ (3,2) x8,y3 ]     [ (3,3) x9,y3 ]   
[ (4,1) x10,y4 ]    [ (4,2) x11,y4 ]    [ (4,3) x12,y4 ]  
[ (5,1) x13,y5 ]    [ (5,2) x14,y5 ]    [ (5,3) x15,y5 ]  
[ (6,1) x16,y6 ]    [ (6,2) x17,y6 ]    [ (6,3) x18,y6 ]  
[ (7,1) x19,y7 ]    [ (7,2) x20,y7 ]    [ (7,3) x21,y7 ]  
[ (8,1) x22,y8 ]    [ (8,2) x23,y8 ]    [ (8,3) x24,y8 ]  
[ (9,1) x25,y9 ]    [ (9,2) x26,y9 ]    [ (9,3) x27,y9 ]  
[ (10,1) x28,y10 ]  [ (10,2) x29,y10 ]  [ (10,3) x30,y10 ]
[ (11,1) x31,y11 ]  [ (11,2) x32,y11 ]  [ (11,3) x33,y11 ]
[ (12,1) x34,y12 ]  [ (12,2) x35,y12 ]  [ (12,3) x36,y12 ]
[ (13,1) x37,y13 ]  [ (13,2) x38,y13 ]  [ (13,3) x39,y13 ]

It also looks like there were some big increases in sales of individual items. It would be interesting to do some more analysis to figure out if those items were in in the new stores that came on.

Prepare Data¶

Add Trailing Average Sales¶

Now for the actual model. The first thing we'll do is add in a rolling average of the prior three weeks' sales.

In [96]:

df_model = df.groupby(['store', 'item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0)
rolling_sum = df_model.apply(lambda x:x.rolling(window=3).mean())
rolling_sum.shift(-1)
df_model['sales_avg'] = rolling_sum.shift(-1)['sales']
df_model.head(10)

Out[96]:

			sales	sales_avg
store	item	date
SEGWD103	41774	2016-02-07	1.0	NaN
		2016-02-14	3.0	2.333333
		2016-02-21	3.0	2.666667
		2016-02-28	2.0	2.666667
		2016-03-06	3.0	3.333333
		2016-03-13	5.0	4.666667
		2016-03-20	6.0	5.666667
		2016-03-27	6.0	5.333333
		2016-04-03	4.0	5.333333
		2016-04-10	6.0	5.000000

In [100]:

df_model.loc['SEGWD7'].to_csv('SEGWD7.csv')

Remove Missing Stores and Items¶

In [30]:

df_model['cum_sales'] = df_model.groupby(level=[0,1]).cumsum()['sales']
df_model.reset_index(inplace=True)
df_model.describe(include='all')

Out[30]:

	store	item	date	sales	sales_avg	cum_sales
count	310596	310596.0	310596	310596.000000	310594.000000	310596.000000
unique	181	39.0	44	NaN	NaN	NaN
top	SEGWD97	42052.0	2016-07-10 00:00:00	NaN	NaN	NaN
freq	1716	7964.0	7059	NaN	NaN	NaN
first	NaN	NaN	2016-02-07 00:00:00	NaN	NaN	NaN
last	NaN	NaN	2016-12-04 00:00:00	NaN	NaN	NaN
mean	NaN	NaN	NaN	1.898807	1.898811	36.002048
std	NaN	NaN	NaN	3.853454	3.667099	88.267833
min	NaN	NaN	NaN	0.000000	0.000000	0.000000
25%	NaN	NaN	NaN	0.000000	0.000000	0.000000
50%	NaN	NaN	NaN	0.000000	0.000000	0.000000
75%	NaN	NaN	NaN	2.000000	2.666667	24.000000
max	NaN	NaN	NaN	118.000000	86.000000	1977.000000

In [31]:

df_model_masked = df_model[df_model.cum_sales != 0]
df_model_masked.describe(include='all')

Out[31]:

	store	item	date	sales	sales_avg	cum_sales
count	144822	144822.0	144822	144822.000000	144820.000000	144822.000000
unique	181	39.0	44	NaN	NaN	NaN
top	SEGWD7	41793.0	2016-11-27 00:00:00	NaN	NaN	NaN
freq	1336	4759.0	6424	NaN	NaN	NaN
first	NaN	NaN	2016-02-07 00:00:00	NaN	NaN	NaN
last	NaN	NaN	2016-12-04 00:00:00	NaN	NaN	NaN
mean	NaN	NaN	NaN	4.072323	4.030594	77.212661
std	NaN	NaN	NaN	4.795340	4.501835	116.308495
min	NaN	NaN	NaN	0.000000	0.000000	1.000000
25%	NaN	NaN	NaN	0.000000	0.666667	9.000000
50%	NaN	NaN	NaN	3.000000	2.666667	29.000000
75%	NaN	NaN	NaN	6.000000	5.666667	98.000000
max	NaN	NaN	NaN	118.000000	86.000000	1977.000000

Encode Store and Item Categories¶

It looks like that eliminated quite a few observations, over half of them. Now we can encode the store and item variables as binary classifications and then we can use the data to train a model.

In [79]:

stores = pd.get_dummies(df_model_masked['store'])
items = pd.get_dummies(df_model_masked['item'])
df_final = pd.concat([df_model_masked, stores, items], axis=1).dropna(how='any')
df_final.drop(['date', 'cum_sales', 'store', 'item', 41793, 'SEGWD103'], axis=1, inplace=True)
# df_final.to_csv('modeldata.csv')

In [80]:

df_final.head()

Out[80]:

	sales	sales_avg	...
1	3.0	2.333333	...
2	3.0	2.666667	...
3	2.0	2.666667	...
4	3.0	3.333333	...
5	5.0	4.666667	...

5 rows × 220 columns

In [81]:

df_final.describe()

Out[81]:

	sales	sales_avg	SEGWD104	SEGWD116	SEGWD12	SEGWD123	SEGWD125	SEGWD129	SEGWD135	SEGWD138	...	42043	42044	42045	42046	42047	42048	42049	42050	42051	42052
count	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	...	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000	144820.000000
mean	4.072373	4.030594	0.003349	0.003356	0.009094	0.009142	0.003370	0.009198	0.009177	0.009142	...	0.016275	0.016013	0.016220	0.015385	0.016641	0.016545	0.016275	0.016655	0.016614	0.016344
std	4.795355	4.501835	0.057774	0.057833	0.094928	0.095178	0.057951	0.095463	0.095356	0.095178	...	0.126533	0.125526	0.126322	0.123077	0.127924	0.127558	0.126533	0.127976	0.127819	0.126797
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.666667	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	3.000000	2.666667	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	6.000000	5.666667	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
max	118.000000	86.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000

8 rows × 220 columns

In [83]:

data = df_final.dropna().as_matrix()
data.shape

Out[83]:

(144820, 220)

Split Train and Test Data¶

We'll save 30% of the data to test our model.

In [84]:

np.random.shuffle(data)
split = int(0.70*data.shape[0])
X_train = np.ones(data[:split,:].shape)
X_test = np.ones(data[split:,:].shape)

X_train[:,1:] = data[:split,1:]
y_train = data[:split,0]

X_test[:,1:] = data[split:,1:]
y_test = data[split:,0]

Train Model¶

This is where we actually train the model. I ran it for 200 iterations - more won't likely increase the predictive power of the model, but there are some other diagnostics we can run to see what other improvements we can make.

In [85]:

from sklearn import linear_model
clf = linear_model.SGDRegressor(n_iter=200)
clf.fit(X_train, y_train)

Out[85]:

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=200, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

Evaluate Model¶

In [86]:

predict = clf.predict(X_test)
predict_neg = predict < 0
error = y_test - predict
error_neg = predict_neg @ error
np.savetxt('modelparams.csv', clf.predict(np.eye(X_test.shape[1])), delimiter=",")
print('R-squared: {:.{p}f}'.format(clf.score(X_test, y_test), p=4))
print('Total error in sales quantity: {:.{p}f}'.format(sum(error), p=0))
print('Total error as a % of actual: {:.{p}f}%'.format(sum(error) / sum(y_test)*100, p=2))
print('Total error in sales quantity with zero min prediction: {:.{p}f}'.format(error_neg, p=0))
print('Total error as a % of actual with zero min prediction: {:.{p}f}%'.format((sum(error)+error_neg) / sum(y_test)*100, p=2))

R-squared: 0.8886
Total error in sales quantity: -7424
Total error as a % of actual: -4.20%
Total error in sales quantity with zero min prediction: 14
Total error as a % of actual with zero min prediction: -4.19%

An overall r-square of 85% is pretty good, I think, for really just dealing with three variables - store and item numbers and historical sales - and the overall error is very low on such a large number of test observations.

In [87]:

predicted = go.Bar(
    name = 'predicted',
    y = clf.predict(X_test)
)

actual = go.Bar(
    name = 'actual',
    y = y_test
)

layout = go.Layout(
    title='Actual vs. Predicted'
)

fig = go.Figure(data=[actual, predicted], layout=layout)
py.iplot(fig, filename='actual-vs-predicted')

Overall this says that this is a pretty good model. The total error over 40,000 individual store-item observations is really low. The graph above shows that we are missing on the big spikes, but overall the performance is good.

Next Steps¶

Further analyze data
Run model diagnostics
Analyze model errors by hand
Evaluate alternative model hyperparameters
Complete Azure implementation

In [88]:

df_final.loc[:100].to_csv('samplemodeldata.csv')

In [ ]:

np.eye(X_test.shape[1]) * clf.predict(X_test)