In [ ]:
# Re-import edited source modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
In [ ]:
from fastai.tabular import *

Rossmann

Data preparation

To create the feature-engineered train_clean and test_clean from the Kaggle competition data, run rossman_data_clean.ipynb. One important step that deals with time series is this:

add_datepart(train, "Date", drop=False)
add_datepart(test, "Date", drop=False)
In [ ]:
# fastai's Config gives the data root; 'train_clean' is the feature-engineered
# frame pickled by rossman_data_clean.ipynb (see the note above this cell).
path = Config().data_path()/'rossmann'
train_df = pd.read_pickle(path/'train_clean')
In [ ]:
train_df.head().T
Out[ ]:
0 1 2 3 4
index 0 1 2 3 4
Store 1 2 3 4 5
DayOfWeek 5 5 5 5 5
Date 2015-07-31 2015-07-31 2015-07-31 2015-07-31 2015-07-31
Sales 5263 6064 8314 13995 4822
Customers 555 625 821 1498 559
Open 1 1 1 1 1
Promo 1 1 1 1 1
StateHoliday False False False False False
SchoolHoliday 1 1 1 1 1
Year 2015 2015 2015 2015 2015
Month 7 7 7 7 7
Week 31 31 31 31 31
Day 31 31 31 31 31
Dayofweek 4 4 4 4 4
Dayofyear 212 212 212 212 212
Is_month_end True True True True True
Is_month_start False False False False False
Is_quarter_end False False False False False
Is_quarter_start False False False False False
Is_year_end False False False False False
Is_year_start False False False False False
Elapsed 1438300800 1438300800 1438300800 1438300800 1438300800
StoreType c a a c a
Assortment a a a c a
CompetitionDistance 1270 570 14130 620 29910
CompetitionOpenSinceMonth 9 11 12 9 4
CompetitionOpenSinceYear 2008 2007 2006 2009 2015
Promo2 0 1 1 0 0
Promo2SinceWeek 1 13 14 1 1
... ... ... ... ... ...
Min_Sea_Level_PressurehPa 1015 1017 1017 1014 1016
Max_VisibilityKm 31 10 31 10 10
Mean_VisibilityKm 15 10 14 10 10
Min_VisibilitykM 10 10 10 10 10
Max_Wind_SpeedKm_h 24 14 14 23 14
Mean_Wind_SpeedKm_h 11 11 5 16 11
Max_Gust_SpeedKm_h NaN NaN NaN NaN NaN
Precipitationmm 0 0 0 0 0
CloudCover 1 4 2 6 4
Events Fog Fog Fog NaN NaN
WindDirDegrees 13 309 354 282 290
StateName Hessen Thueringen NordrheinWestfalen Berlin Sachsen
CompetitionOpenSince 2008-09-15 2007-11-15 2006-12-15 2009-09-15 2015-04-15
CompetitionDaysOpen 2510 2815 3150 2145 107
CompetitionMonthsOpen 24 24 24 24 3
Promo2Since 1900-01-01 2010-03-29 2011-04-04 1900-01-01 1900-01-01
Promo2Days 0 1950 1579 0 0
Promo2Weeks 0 25 25 0 0
AfterSchoolHoliday 0 0 0 0 0
BeforeSchoolHoliday 0 0 0 0 0
AfterStateHoliday 57 67 57 67 57
BeforeStateHoliday 0 0 0 0 0
AfterPromo 0 0 0 0 0
BeforePromo 0 0 0 0 0
SchoolHoliday_bw 5 5 5 5 5
StateHoliday_bw 0 0 0 0 0
Promo_bw 5 5 5 5 5
SchoolHoliday_fw 7 1 5 1 1
StateHoliday_fw 0 0 0 0 0
Promo_fw 5 1 5 1 1

93 rows × 5 columns

In [ ]:
n = len(train_df); n
Out[ ]:
844338

Experimenting with a sample

In [ ]:
# Draw a reproducible 2,000-row sample and split it into a small train/test
# pair restricted to a handful of columns, to experiment with the transforms.
# Seeding the RNG fixes the previous defect: the unseeded permutation made
# this sample (and every result downstream of it) change on each re-run.
np.random.seed(42)
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars =  ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]
In [ ]:
small_train_df.head()
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales
267 268 5 NaN 4520.0 67 7492
604 606 5 NaN 2260.0 61 7187
983 986 5 Feb,May,Aug,Nov 620.0 61 7051
1636 525 4 NaN 1870.0 55 9673
2348 123 3 NaN 16760.0 50 10007
In [ ]:
small_test_df.head()
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales
420510 829 3 NaN 110.0 55 6802
420654 973 3 Jan,Apr,Jul,Oct 330.0 59 6644
420990 194 2 Feb,May,Aug,Nov 16970.0 55 4720
421308 512 2 Mar,Jun,Sept,Dec 590.0 72 6248
421824 1029 2 NaN 1590.0 64 8004
In [ ]:
# Categorify converts the categorical columns to pandas Categoricals.
# With test=True the category levels learned from the training frame are
# reused, so values unseen in training come out as NaN (see the next output,
# where some Store ids in the test sample are NaN).
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)
In [ ]:
small_test_df.head()
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales
420510 NaN 3 NaN 110.0 55 6802
420654 973.0 3 Jan,Apr,Jul,Oct 330.0 59 6644
420990 NaN 2 Feb,May,Aug,Nov 16970.0 55 4720
421308 512.0 2 Mar,Jun,Sept,Dec 590.0 72 6248
421824 1029.0 2 NaN 1590.0 64 8004
In [ ]:
small_train_df.PromoInterval.cat.categories
Out[ ]:
Index(['Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec'], dtype='object')
In [ ]:
small_train_df['PromoInterval'].cat.codes[:5]
Out[ ]:
267    -1
604    -1
983     0
1636   -1
2348   -1
dtype: int8
In [ ]:
# FillMissing imputes missing continuous values and adds a boolean
# '<col>_na' flag column marking which rows were imputed (presumably the
# training-set median is used — TODO confirm for this fastai version).
# test=True reuses the fill values computed on the training frame.
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)
In [ ]:
small_train_df[small_train_df['CompetitionDistance_na'] == True]
Out[ ]:
Store DayOfWeek PromoInterval CompetitionDistance Mean_Humidity Sales CompetitionDistance_na
185749 622 2 NaN 2300.0 93 4508 True

Preparing full data set

In [ ]:
# Reload the full cleaned train and test frames for the real run.
train_df = pd.read_pickle(path/'train_clean')
test_df = pd.read_pickle(path/'test_clean')
In [ ]:
len(train_df),len(test_df)
Out[ ]:
(844338, 41088)
In [ ]:
procs=[FillMissing, Categorify, Normalize]
In [ ]:
# Columns treated as categorical (each gets an embedding in the model) —
# note this includes low-cardinality numeric columns like Year and Week.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

# Columns treated as continuous (normalized and fed straight into the net).
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
In [ ]:
# Target column; keep Date alongside the features so we can do a
# time-based validation split below.
dep_var = 'Sales'
df = train_df[cat_vars + cont_vars + [dep_var,'Date']].copy()
In [ ]:
test_df['Date'].min(), test_df['Date'].max()
Out[ ]:
('2015-08-01', '2015-09-17')
In [ ]:
# Pick a validation cutoff mirroring the test set: find the date at row
# `len(test_df)` and take the last row index sharing that date, so the
# validation window has roughly the same number of rows as the test set.
# NOTE(review): this relies on train_df being sorted by Date descending
# (the head() above shows the most recent date, 2015-07-31, at index 0) —
# confirm the sort order before changing this cell.
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut
Out[ ]:
41395
In [ ]:
valid_idx = range(cut)
In [ ]:
df[dep_var].head()
Out[ ]:
0     5263
1     6064
2     8314
3    13995
4     4822
Name: Sales, dtype: int64
In [ ]:
# Build the DataBunch: apply `procs`, route the first `cut` rows to the
# validation set, and label with log(Sales) as a float regression target
# (log=True), attaching the Kaggle test set for later inference.
data = (TabularList.from_df(df, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs,)
                .split_by_idx(valid_idx)
                .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
                .add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars))
                .databunch())
In [ ]:
doc(FloatList)

Model

In [ ]:
# Output range for the regression head: clamp predictions (in log space) to
# [0, log(1.2 * max observed Sales)]; the 20% headroom lets the model predict
# slightly above the largest value seen in training.
max_log_y = np.log(train_df['Sales'].max() * 1.2)
y_range = torch.tensor([0, max_log_y], device=defaults.device)
In [ ]:
# Two hidden layers (1000 -> 500) with per-layer dropout, dropout on the
# concatenated embeddings, the y_range computed above, and RMSPE (on
# exponentiated predictions) as the competition metric.
learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, 
                        y_range=y_range, metrics=exp_rmspe)
In [ ]:
learn.model
Out[ ]:
TabularModel(
  (embeds): ModuleList(
    (0): Embedding(1116, 81)
    (1): Embedding(8, 5)
    (2): Embedding(4, 3)
    (3): Embedding(13, 7)
    (4): Embedding(32, 11)
    (5): Embedding(3, 3)
    (6): Embedding(26, 10)
    (7): Embedding(27, 10)
    (8): Embedding(5, 4)
    (9): Embedding(4, 3)
    (10): Embedding(4, 3)
    (11): Embedding(24, 9)
    (12): Embedding(9, 5)
    (13): Embedding(13, 7)
    (14): Embedding(53, 15)
    (15): Embedding(22, 9)
    (16): Embedding(7, 5)
    (17): Embedding(7, 5)
    (18): Embedding(4, 3)
    (19): Embedding(4, 3)
    (20): Embedding(9, 5)
    (21): Embedding(9, 5)
    (22): Embedding(3, 3)
    (23): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.04)
  (bn_cont): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=233, out_features=1000, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.001)
    (4): Linear(in_features=1000, out_features=500, bias=True)
    (5): ReLU(inplace)
    (6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.01)
    (8): Linear(in_features=500, out_features=1, bias=True)
  )
)
In [ ]:
len(data.train_ds.cont_names)
Out[ ]:
16
In [ ]:
learn.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
In [ ]:
learn.recorder.plot()
In [ ]:
learn.fit_one_cycle(5, 1e-3, wd=0.2)
Total time: 11:27

epoch train_loss valid_loss exp_rmspe
1 0.023587 0.020941 0.140551
2 0.017678 0.023431 0.132211
3 0.017453 0.016929 0.120169
4 0.012608 0.016296 0.109245
5 0.010222 0.011238 0.105433
In [ ]:
learn.save('1')
In [ ]:
learn.recorder.plot_losses(skip_start=10000)
In [ ]:
learn.load('1');
In [ ]:
learn.fit_one_cycle(5, 3e-4)
Total time: 11:32

epoch train_loss valid_loss exp_rmspe
1 0.012223 0.014312 0.116988
2 0.012001 0.017789 0.117619
3 0.011402 0.035596 0.114396
4 0.010067 0.015125 0.113652
5 0.009148 0.031326 0.116344
In [ ]:
learn.fit_one_cycle(5, 3e-4)
Total time: 11:31

epoch train_loss valid_loss exp_rmspe
1 0.011840 0.013236 0.110483
2 0.010765 0.057664 0.129586
3 0.010101 0.042744 0.111584
4 0.008820 0.116893 0.135458
5 0.009144 0.017969 0.126323

(10th place in the competition was 0.108)

In [ ]:
# Predict on the test set and write the Kaggle submission file.
# The model was trained on log(Sales), so invert with exp.
# Fix: the original cast *both* Id and Sales to int, which truncated
# (floored) every predicted Sales value; keep Sales as float and only
# cast Id, which must be an integer in the submission.
test_preds = learn.get_preds(ds_type=DatasetType.Test)
test_df["Sales"] = np.exp(test_preds[0].data).numpy().T[0]
test_df["Id"] = test_df["Id"].astype("int")
test_df[["Id", "Sales"]].to_csv("rossmann_submission.csv", index=False)