In [ ]:
# Re-import edited source modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
In [ ]:
from fastai.tabular import *

Rossmann

Data preparation

To create the feature-engineered train_clean and test_clean from the Kaggle competition data, run rossman_data_clean.ipynb. One important step that deals with time series is this:

add_datepart(train, "Date", drop=False)
add_datepart(test, "Date", drop=False)
In [ ]:
# fastai's Config gives the data root; 'train_clean' is the feature-engineered
# frame pickled by rossman_data_clean.ipynb (see the note above this cell).
path = Config().data_path()/'rossmann'
train_df = pd.read_pickle(path/'train_clean')
In [ ]:
train_df.head().T
Out[ ]:
0 1 2 3 4
index 0 1 2 3 4
Store 1 2 3 4 5
DayOfWeek 5 5 5 5 5
Date 2015-07-31 2015-07-31 2015-07-31 2015-07-31 2015-07-31
Sales 5263 6064 8314 13995 4822
Customers 555 625 821 1498 559
Open 1 1 1 1 1
Promo 1 1 1 1 1
StateHoliday False False False False False
SchoolHoliday 1 1 1 1 1
Year 2015 2015 2015 2015 2015
Month 7 7 7 7 7
Week 31 31 31 31 31
Day 31 31 31 31 31
Dayofweek 4 4 4 4 4
Dayofyear 212 212 212 212 212
Is_month_end True True True True True
Is_month_start False False False False False
Is_quarter_end False False False False False
Is_quarter_start False False False False False
Is_year_end False False False False False
Is_year_start False False False False False
Elapsed 1438300800 1438300800 1438300800 1438300800 1438300800
StoreType c a a c a
Assortment a a a c a
CompetitionDistance 1270 570 14130 620 29910
CompetitionOpenSinceMonth 9 11 12 9 4
CompetitionOpenSinceYear 2008 2007 2006 2009 2015
Promo2 0 1 1 0 0
Promo2SinceWeek 1 13 14 1 1
... ... ... ... ... ...
Min_Sea_Level_PressurehPa 1015 1017 1017 1014 1016
Max_VisibilityKm 31 10 31 10 10
Mean_VisibilityKm 15 10 14 10 10
Min_VisibilitykM 10 10 10 10 10
Max_Wind_SpeedKm_h 24 14 14 23 14
Mean_Wind_SpeedKm_h 11 11 5 16 11
Max_Gust_SpeedKm_h NaN NaN NaN NaN NaN
Precipitationmm 0 0 0 0 0
CloudCover 1 4 2 6 4
Events Fog Fog Fog NaN NaN
WindDirDegrees 13 309 354 282 290
StateName Hessen Thueringen NordrheinWestfalen Berlin Sachsen
CompetitionOpenSince 2008-09-15 2007-11-15 2006-12-15 2009-09-15 2015-04-15
CompetitionDaysOpen 2510 2815 3150 2145 107
CompetitionMonthsOpen 24 24 24 24 3
Promo2Since 1900-01-01 2010-03-29 2011-04-04 1900-01-01 1900-01-01
Promo2Days 0 1950 1579 0 0
Promo2Weeks 0 25 25 0 0
AfterSchoolHoliday 0 0 0 0 0
BeforeSchoolHoliday 0 0 0 0 0
AfterStateHoliday 57 67 57 67 57
BeforeStateHoliday 0 0 0 0 0
AfterPromo 0 0 0 0 0
BeforePromo 0 0 0 0 0
SchoolHoliday_bw 5 5 5 5 5
StateHoliday_bw 0 0 0 0 0
Promo_bw 5 5 5 5 5
SchoolHoliday_fw 7 1 5 1 1
StateHoliday_fw 0 0 0 0 0
Promo_fw 5 1 5 1 1

93 rows × 5 columns

In [ ]:
n = len(train_df); n
Out[ ]:
844338

Experimenting with a sample

In [ ]:
# Draw a reproducible 2,000-row sample and split it into a small train/test
# pair restricted to a handful of columns, to experiment with the transforms.
# Seeding the RNG fixes the previous defect: the unseeded permutation made
# this sample (and every result downstream of it) change on each re-run.
np.random.seed(42)
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars =  ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]
In [ ]:
small_train_df.head()
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales
267 268 5 NaN 4520.0 67 7492
604 606 5 NaN 2260.0 61 7187
983 986 5 Feb,May,Aug,Nov 620.0 61 7051
1636 525 4 NaN 1870.0 55 9673
2348 123 3 NaN 16760.0 50 10007
In [ ]:
small_test_df.head()
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales
420510 829 3 NaN 110.0 55 6802
420654 973 3 Jan,Apr,Jul,Oct 330.0 59 6644
420990 194 2 Feb,May,Aug,Nov 16970.0 55 4720
421308 512 2 Mar,Jun,Sept,Dec 590.0 72 6248
421824 1029 2 NaN 1590.0 64 8004
In [ ]:
# Categorify converts the categorical columns to pandas Categoricals.
# With test=True the category levels learned from the training frame are
# reused, so values unseen in training come out as NaN (see the next output,
# where some Store ids in the test sample are NaN).
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)
In [ ]:
small_test_df.head()
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales
420510 NaN 3 NaN 110.0 55 6802
420654 973.0 3 Jan,Apr,Jul,Oct 330.0 59 6644
420990 NaN 2 Feb,May,Aug,Nov 16970.0 55 4720
421308 512.0 2 Mar,Jun,Sept,Dec 590.0 72 6248
421824 1029.0 2 NaN 1590.0 64 8004
In [ ]:
small_train_df.PromoInterval.cat.categories
Out[ ]:
Index(['Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec'], dtype='object')
In [ ]:
small_train_df['PromoInterval'].cat.codes[:5]
Out[ ]:
267    -1
604    -1
983     0
1636   -1
2348   -1
dtype: int8
In [ ]:
# FillMissing imputes missing continuous values and adds a boolean
# '<col>_na' flag column marking which rows were imputed (presumably the
# training-set median is used — TODO confirm for this fastai version).
# test=True reuses the fill values computed on the training frame.
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)
In [ ]:
small_train_df[small_train_df['CompetitionDistance_na'] == True]
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales CompetitionDistance_na
185749 622 2 NaN 2300.0 93 4508 True

Preparing full data set

In [ ]:
# Reload the full cleaned train and test frames for the real run.
train_df = pd.read_pickle(path/'train_clean')
test_df = pd.read_pickle(path/'test_clean')
In [ ]:
len(train_df),len(test_df)
Out[ ]:
(844338, 41088)
In [ ]:
procs=[FillMissing, Categorify, Normalize]
In [ ]:
# Columns treated as categorical (each gets an embedding in the model) —
# note this includes low-cardinality numeric columns like Year and Week.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

# Columns treated as continuous (normalized and fed straight into the net).
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
In [ ]:
# Target column; keep Date alongside the features so we can do a
# time-based validation split below.
dep_var = 'Sales'
df = train_df[cat_vars + cont_vars + [dep_var,'Date']].copy()
In [ ]:
test_df['Date'].min(), test_df['Date'].max()
Out[ ]:
('2015-08-01', '2015-09-17')
In [ ]:
# Pick a validation cutoff mirroring the test set: find the date at row
# `len(test_df)` and take the last row index sharing that date, so the
# validation window has roughly the same number of rows as the test set.
# NOTE(review): this relies on train_df being sorted by Date descending
# (the head() above shows the most recent date, 2015-07-31, at index 0) —
# confirm the sort order before changing this cell.
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut
Out[ ]:
41395
In [ ]:
valid_idx = range(cut)
In [ ]:
df[dep_var].head()
Out[ ]:
0     5263
1     6064
2     8314
3    13995
4     4822
Name: Sales, dtype: int64
In [ ]:
# Build the DataBunch: apply `procs`, route the first `cut` rows to the
# validation set, and label with log(Sales) as a float regression target
# (log=True), attaching the Kaggle test set for later inference.
data = (TabularList.from_df(df, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs,)
                .split_by_idx(valid_idx)
                .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
                .add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars))
                .databunch())
In [ ]:
doc(FloatList)

Model

In [ ]:
# Output range for the regression head: clamp predictions (in log space) to
# [0, log(1.2 * max observed Sales)]; the 20% headroom lets the model predict
# slightly above the largest value seen in training.
max_log_y = np.log(train_df['Sales'].max() * 1.2)
y_range = torch.tensor([0, max_log_y], device=defaults.device)
In [ ]:
# Two hidden layers (1000 -> 500) with per-layer dropout, dropout on the
# concatenated embeddings, the y_range computed above, and RMSPE (on
# exponentiated predictions) as the competition metric.
learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, 
                        y_range=y_range, metrics=exp_rmspe)
In [ ]:
learn.model
Out[ ]:
TabularModel(
  (embeds): ModuleList(
    (0): Embedding(1116, 81)
    (1): Embedding(8, 5)
    (2): Embedding(4, 3)
    (3): Embedding(13, 7)
    (4): Embedding(32, 11)
    (5): Embedding(3, 3)
    (6): Embedding(26, 10)
    (7): Embedding(27, 10)
    (8): Embedding(5, 4)
    (9): Embedding(4, 3)
    (10): Embedding(4, 3)
    (11): Embedding(24, 9)
    (12): Embedding(9, 5)
    (13): Embedding(13, 7)
    (14): Embedding(53, 15)
    (15): Embedding(22, 9)
    (16): Embedding(7, 5)
    (17): Embedding(7, 5)
    (18): Embedding(4, 3)
    (19): Embedding(4, 3)
    (20): Embedding(9, 5)
    (21): Embedding(9, 5)
    (22): Embedding(3, 3)
    (23): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.04)
  (bn_cont): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=233, out_features=1000, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.001)
    (4): Linear(in_features=1000, out_features=500, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.01)
    (8): Linear(in_features=500, out_features=1, bias=True)
  )
)
In [ ]:
len(data.train_ds.cont_names)
Out[ ]:
16
In [ ]:
learn.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
In [ ]:
learn.recorder.plot()
In [ ]:
learn.fit_one_cycle(5, 1e-3, wd=0.2)
Total time: 11:27

epoch train_loss valid_loss exp_rmspe
1 0.023587 0.020941 0.140551
2 0.017678 0.023431 0.132211
3 0.017453 0.016929 0.120169
4 0.012608 0.016296 0.109245
5 0.010222 0.011238 0.105433
In [ ]:
learn.save('1')
In [ ]:
learn.recorder.plot_losses(skip_start=10000)
In [ ]:
learn.load('1');
In [ ]:
learn.fit_one_cycle(5, 3e-4)
Total time: 11:32

epoch train_loss valid_loss exp_rmspe
1 0.012223 0.014312 0.116988
2 0.012001 0.017789 0.117619
3 0.011402 0.035596 0.114396
4 0.010067 0.015125 0.113652
5 0.009148 0.031326 0.116344
In [ ]:
learn.fit_one_cycle(5, 3e-4)
Total time: 11:31

epoch train_loss valid_loss exp_rmspe
1 0.011840 0.013236 0.110483
2 0.010765 0.057664 0.129586
3 0.010101 0.042744 0.111584
4 0.008820 0.116893 0.135458
5 0.009144 0.017969 0.126323

(10th place in the competition was 0.108)

In [ ]:
# Predict on the test set and write the Kaggle submission file.
# The model was trained on log(Sales), so invert with exp.
# Fix: the original cast *both* Id and Sales to int, which truncated
# (floored) every predicted Sales value; keep Sales as float and only
# cast Id, which must be an integer in the submission.
test_preds = learn.get_preds(ds_type=DatasetType.Test)
test_df["Sales"] = np.exp(test_preds[0].data).numpy().T[0]
test_df["Id"] = test_df["Id"].astype("int")
test_df[["Id", "Sales"]].to_csv("rossmann_submission.csv", index=False)