%reload_ext autoreload
%autoreload 2
from fastai import *
from fastai.tabular import *
To create the feature-engineered files train_clean and test_clean from the initial data, run x_009a_rossman_data_clean.
# pyarrow backs pandas' feather reader; imported here so read_feather works.
import pyarrow
path = Path('data/rossmann/')
# Load the feature-engineered tables produced by the data-cleaning notebook.
train_df = pd.read_feather(path/'train_clean')
test_df = pd.read_feather(path/'test_clean')
/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/io/feather_format.py:112: FutureWarning: `nthreads` argument is deprecated, pass `use_threads` instead return feather.read_dataframe(path, nthreads=nthreads)
# Transpose head() so the ~93 columns display as rows instead of being truncated.
train_df.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
index | 0 | 1 | 2 | 3 | 4 |
Store | 1 | 2 | 3 | 4 | 5 |
DayOfWeek | 5 | 5 | 5 | 5 | 5 |
Date | 2015-07-31 00:00:00 | 2015-07-31 00:00:00 | 2015-07-31 00:00:00 | 2015-07-31 00:00:00 | 2015-07-31 00:00:00 |
Sales | 5263 | 6064 | 8314 | 13995 | 4822 |
Customers | 555 | 625 | 821 | 1498 | 559 |
Open | 1 | 1 | 1 | 1 | 1 |
Promo | 1 | 1 | 1 | 1 | 1 |
StateHoliday | False | False | False | False | False |
SchoolHoliday | 1 | 1 | 1 | 1 | 1 |
Year | 2015 | 2015 | 2015 | 2015 | 2015 |
Month | 7 | 7 | 7 | 7 | 7 |
Week | 31 | 31 | 31 | 31 | 31 |
Day | 31 | 31 | 31 | 31 | 31 |
Dayofweek | 4 | 4 | 4 | 4 | 4 |
Dayofyear | 212 | 212 | 212 | 212 | 212 |
Is_month_end | True | True | True | True | True |
Is_month_start | False | False | False | False | False |
Is_quarter_end | False | False | False | False | False |
Is_quarter_start | False | False | False | False | False |
Is_year_end | False | False | False | False | False |
Is_year_start | False | False | False | False | False |
Elapsed | 1438300800 | 1438300800 | 1438300800 | 1438300800 | 1438300800 |
StoreType | c | a | a | c | a |
Assortment | a | a | a | c | a |
CompetitionDistance | 1270 | 570 | 14130 | 620 | 29910 |
CompetitionOpenSinceMonth | 9 | 11 | 12 | 9 | 4 |
CompetitionOpenSinceYear | 2008 | 2007 | 2006 | 2009 | 2015 |
Promo2 | 0 | 1 | 1 | 0 | 0 |
Promo2SinceWeek | 1 | 13 | 14 | 1 | 1 |
... | ... | ... | ... | ... | ... |
Min_Sea_Level_PressurehPa | 1015 | 1017 | 1017 | 1014 | 1016 |
Max_VisibilityKm | 31 | 10 | 31 | 10 | 10 |
Mean_VisibilityKm | 15 | 10 | 14 | 10 | 10 |
Min_VisibilitykM | 10 | 10 | 10 | 10 | 10 |
Max_Wind_SpeedKm_h | 24 | 14 | 14 | 23 | 14 |
Mean_Wind_SpeedKm_h | 11 | 11 | 5 | 16 | 11 |
Max_Gust_SpeedKm_h | NaN | NaN | NaN | NaN | NaN |
Precipitationmm | 0 | 0 | 0 | 0 | 0 |
CloudCover | 1 | 4 | 2 | 6 | 4 |
Events | Fog | Fog | Fog | None | None |
WindDirDegrees | 13 | 309 | 354 | 282 | 290 |
StateName | Hessen | Thueringen | NordrheinWestfalen | Berlin | Sachsen |
CompetitionOpenSince | 2008-09-15 00:00:00 | 2007-11-15 00:00:00 | 2006-12-15 00:00:00 | 2009-09-15 00:00:00 | 2015-04-15 00:00:00 |
CompetitionDaysOpen | 2510 | 2815 | 3150 | 2145 | 107 |
CompetitionMonthsOpen | 24 | 24 | 24 | 24 | 3 |
Promo2Since | 1900-01-01 00:00:00 | 2010-03-29 00:00:00 | 2011-04-04 00:00:00 | 1900-01-01 00:00:00 | 1900-01-01 00:00:00 |
Promo2Days | 0 | 1950 | 1579 | 0 | 0 |
Promo2Weeks | 0 | 25 | 25 | 0 | 0 |
AfterSchoolHoliday | 0 | 0 | 0 | 0 | 0 |
BeforeSchoolHoliday | 0 | 0 | 0 | 0 | 0 |
AfterStateHoliday | 57 | 67 | 57 | 67 | 57 |
BeforeStateHoliday | 0 | 0 | 0 | 0 | 0 |
AfterPromo | 0 | 0 | 0 | 0 | 0 |
BeforePromo | 0 | 0 | 0 | 0 | 0 |
SchoolHoliday_bw | 5 | 5 | 5 | 5 | 5 |
StateHoliday_bw | 0 | 0 | 0 | 0 | 0 |
Promo_bw | 5 | 5 | 5 | 5 | 5 |
SchoolHoliday_fw | 7 | 1 | 5 | 1 | 1 |
StateHoliday_fw | 0 | 0 | 0 | 0 | 0 |
Promo_fw | 5 | 1 | 5 | 1 | 1 |
93 rows × 5 columns
# Categorical predictors — each will get an embedding in the tabular model.
cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
    'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
    'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events',
    'Promo_fw', 'Promo_bw',
    'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw',
]
# Continuous predictors — fed to the network directly (after normalization).
cont_vars = [
    'CompetitionDistance',
    'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
    'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h',
    'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday', 'BeforeStateHoliday',
    'Promo', 'SchoolHoliday',
]
# Total number of training rows (echoed by the trailing expression).
n = len(train_df); n
844338
# Draw 2000 distinct random row positions for a toy demo of the transforms.
# NOTE(review): the RNG is unseeded, so this sample is not reproducible.
idx = np.random.permutation(range(n))[:2000]
# Sort so each subset keeps the dataframe's original row order.
idx.sort()
# First 1000 sampled rows -> toy train frame, remaining 1000 -> toy test frame.
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
# Keep only the toy feature columns plus the dependent variable.
small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars+small_cont_vars + ['Sales']]
small_train_df.head()
Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales | |
---|---|---|---|---|---|---|
720 | 722 | 5 | None | 50.0 | 67 | 9349 |
761 | 763 | 5 | None | 32240.0 | 61 | 8022 |
1445 | 334 | 4 | Mar,Jun,Sept,Dec | 4040.0 | 73 | 6050 |
2302 | 77 | 3 | Jan,Apr,Jul,Oct | 1090.0 | 54 | 7865 |
2424 | 199 | 3 | Mar,Jun,Sept,Dec | 6360.0 | 63 | 9121 |
# Peek at the toy test frame before any transforms are applied.
small_test_df.head()
Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales | |
---|---|---|---|---|---|---|
418845 | 276 | 4 | Mar,Jun,Sept,Dec | 2960.0 | 51 | 4892 |
418998 | 429 | 4 | Jan,Apr,Jul,Oct | 16350.0 | 67 | 5242 |
419398 | 830 | 4 | Jan,Apr,Jul,Oct | 6320.0 | 51 | 6087 |
420007 | 325 | 3 | Feb,May,Aug,Nov | 350.0 | 59 | 7110 |
420692 | 1011 | 3 | Feb,May,Aug,Nov | 490.0 | 59 | 9483 |
# Categorify converts the listed categorical columns to pandas categoricals
# in place, learning the category set from the frame it is applied to.
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
# test=True reuses the category mappings learned on the training frame;
# values never seen in training become NaN (see Store in the output below).
categorify(small_test_df, test=True)
small_test_df.head()
Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales | |
---|---|---|---|---|---|---|
418845 | 276.0 | 4 | Mar,Jun,Sept,Dec | 2960.0 | 51 | 4892 |
418998 | 429.0 | 4 | Jan,Apr,Jul,Oct | 16350.0 | 67 | 5242 |
419398 | 830.0 | 4 | Jan,Apr,Jul,Oct | 6320.0 | 51 | 6087 |
420007 | 325.0 | 3 | Feb,May,Aug,Nov | 350.0 | 59 | 7110 |
420692 | NaN | 3 | Feb,May,Aug,Nov | 490.0 | 59 | 9483 |
# Integer codes behind the categorical; -1 is the code pandas uses for NaN.
small_train_df['PromoInterval'].cat.codes[:5]
720 -1 761 -1 1445 2 2302 1 2424 2 dtype: int8
# A store absent from the training sample maps to code -1 (unknown) here.
small_test_df['Store'].cat.codes[:5]
418845 147 418998 234 419398 481 420007 173 420692 -1 dtype: int16
# FillMissing fills NaNs in the continuous columns and records where it did so.
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
# test=True fills the test frame using statistics computed on the train frame.
fill_missing(small_test_df, test=True)
# Rows that had a missing CompetitionDistance: all filled with the same value
# (2620.0 — presumably the training-set median; confirm against FillMissing's
# docs) and flagged via the new boolean CompetitionDistance_na column.
small_train_df[small_train_df['CompetitionDistance_na'] == True]
Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales | CompetitionDistance_na | |
---|---|---|---|---|---|---|---|
18160 | 291 | 1 | NaN | 2620.0 | 83 | 12663 | True |
36083 | 291 | 3 | NaN | 2620.0 | 77 | 5479 | True |
88124 | 291 | 1 | NaN | 2620.0 | 77 | 10660 | True |
311084 | 291 | 3 | NaN | 2620.0 | 73 | 9244 | True |
331651 | 291 | 5 | NaN | 2620.0 | 81 | 6994 | True |
# Same check on the toy test frame. The NaN Store values introduced by
# Categorify remain — FillMissing only touched the continuous columns here.
small_test_df[small_test_df['CompetitionDistance_na'] == True]
Store | DayOfWeek | PromoInterval | CompetitionDistance | Mean_Humidity | Sales | CompetitionDistance_na | |
---|---|---|---|---|---|---|---|
584834 | NaN | 2 | Feb,May,Aug,Nov | 2620.0 | 96 | 4772 | True |
611734 | NaN | 1 | Feb,May,Aug,Nov | 2620.0 | 75 | 6035 | True |
745902 | NaN | 3 | NaN | 2620.0 | 70 | 3654 | True |
760633 | NaN | 2 | Feb,May,Aug,Nov | 2620.0 | 83 | 3179 | True |
815761 | 291.0 | 4 | NaN | 2620.0 | 66 | 7531 | True |
TODO: add a note about `Normalize`, which standardizes the continuous variables using statistics computed on the training set.
# Reload the full, untouched training table for the real modelling run.
train_df = pd.read_feather(path/'train_clean')
/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/io/feather_format.py:112: FutureWarning: `nthreads` argument is deprecated, pass `use_threads` instead return feather.read_dataframe(path, nthreads=nthreads)
# Preprocessors applied per column when the DataBunch is assembled below,
# in order: fill missing values, encode categoricals, normalize continuous.
procs=[FillMissing, Categorify, Normalize]
# NOTE(review): cat_names/cont_names repeat cat_vars/cont_vars defined earlier
# in the notebook — keep the two pairs in sync, or reuse one of them.
cat_names = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
'SchoolHoliday_fw', 'SchoolHoliday_bw']
cont_names = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
# Target column: daily sales, modelled in log space below.
dep_var = 'Sales'
# Re-read the training table (redundant with the load just above, but harmless).
train_df = pd.read_feather(path/'train_clean')
# Keep only the modelling columns; Date is retained to build the
# time-based validation split below.
df = train_df[cat_vars+cont_vars+[dep_var, 'Date']].copy()
/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/io/feather_format.py:112: FutureWarning: `nthreads` argument is deprecated, pass `use_threads` instead return feather.read_dataframe(path, nthreads=nthreads)
# Date span covered by the test set (2015-08-01 through 2015-09-17, per output).
test_df['Date'].min(), test_df['Date'].max()
(Timestamp('2015-08-01 00:00:00'), Timestamp('2015-09-17 00:00:00'))
# Number of test rows — used below to size a comparable validation window.
len(test_df)
41088
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut
41395
valid_idx = range(cut)
# Assemble the DataBunch: tabular rows -> time-based validation split ->
# log-sales float labels (log=True) -> batched loaders.
data = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
.split_by_idx(valid_idx)
.label_from_df(cols=dep_var, label_cls=FloatList, log=True)
.databunch())
# Largest log-sales value in the data; the output range is capped 20% above
# it so the final sigmoid can comfortably reach every training target.
max_log_y = np.log(np.max(train_df['Sales']))
y_range = torch.tensor([0, max_log_y*1.2], device=defaults.device)
# {} -> accept fastai's default embedding size for every categorical column.
emb_szs = data.get_emb_szs({})
# Two hidden layers (1000, 500) with light per-layer dropout; y_range clamps
# the single regression output to [0, max_log_y*1.2].
model = TabularModel(emb_szs, len(cont_vars), 1, [1000,500], [0.001,0.01], emb_drop=0.04, y_range=y_range)
# Echo the module tree to inspect embedding sizes and layer shapes.
model
TabularModel( (embeds): ModuleList( (0): Embedding(1116, 50) (1): Embedding(8, 5) (2): Embedding(4, 3) (3): Embedding(13, 7) (4): Embedding(32, 17) (5): Embedding(3, 2) (6): Embedding(26, 14) (7): Embedding(27, 14) (8): Embedding(5, 3) (9): Embedding(4, 3) (10): Embedding(4, 3) (11): Embedding(24, 13) (12): Embedding(9, 5) (13): Embedding(13, 7) (14): Embedding(53, 27) (15): Embedding(22, 12) (16): Embedding(7, 4) (17): Embedding(7, 4) (18): Embedding(4, 3) (19): Embedding(4, 3) (20): Embedding(9, 5) (21): Embedding(9, 5) (22): Embedding(3, 2) (23): Embedding(3, 2) ) (emb_drop): Dropout(p=0.04) (bn_cont): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (layers): Sequential( (0): Linear(in_features=229, out_features=1000, bias=True) (1): ReLU(inplace) (2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (3): Dropout(p=0.001) (4): Linear(in_features=1000, out_features=500, bias=True) (5): ReLU(inplace) (6): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (7): Dropout(p=0.01) (8): Linear(in_features=500, out_features=1, bias=True) ) )
# Cardinality of each categorical variable; the model's embeddings each have
# one extra row (e.g. 1116 vs 1115) for the unknown/missing category.
[len(v) for k,v in data.train_ds.classes.items()]
[1115, 7, 3, 12, 31, 2, 25, 26, 4, 3, 3, 23, 8, 12, 52, 21, 6, 6, 3, 3, 8, 8, 2, 2]
# Sanity check: 16 continuous inputs, matching BatchNorm1d(16) in the model.
len(data.train_ds.cont_names)
16
learn = Learner(data, model)
# NOTE(review): released fastai v1 names this attribute `loss_func`; if the
# installed version uses that name, this assignment is an inert extra
# attribute and the learner falls back to its default loss — verify.
learn.loss_fn = F.mse_loss
# Root-mean-square percentage error on exp(prediction) — the Kaggle metric.
learn.metrics = [exp_rmspe]
# LR range test: sweep learning rates to pick one from the loss curve.
learn.lr_find()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
# Plot the LR-finder curve, then train 5 one-cycle epochs at lr=1e-3.
learn.recorder.plot()
learn.fit_one_cycle(5, 1e-3, wd=0.2, pct_start=0.2)
Total time: 13:27 epoch train_loss valid_loss exp_rmspe 1 0.021706 0.019131 0.586892 (02:38) 2 0.019761 0.016307 0.631732 (02:42) 3 0.016764 0.016188 0.644211 (02:42) 4 0.012963 0.011598 0.630723 (02:42) 5 0.010889 0.011673 0.613048 (02:42)
# Train a further 5 one-cycle epochs with lighter weight decay.
learn.fit_one_cycle(5, 1e-3, wd=0.1, pct_start=0.3)

# Manually recompute RMSPE on the validation set as a cross-check against the
# exp_rmspe metric reported during training.
with torch.no_grad():
    pct_var, cnt = 0., 0
    for x, y in learn.data.valid_dl:
        out = learn.model(*x)
        cnt += y.size(0)
        # Flatten both tensors before the arithmetic: if the model emits
        # shape (bs, 1) while targets are (bs,), `(y - out)/y` broadcasts to
        # a (bs, bs) matrix while cnt only counts bs rows — which is
        # consistent with the original printing 6.34 here versus ~0.61 from
        # exp_rmspe. Flattening is a no-op when shapes already agree.
        y, out = y.view(-1), out.view(-1)
        # Undo the log transform: training targets are log(Sales).
        y, out = torch.exp(y), torch.exp(out)
        pct_var += ((y - out)/y).pow(2).sum()
# Root-mean-square percentage error over all validation rows.
torch.sqrt(pct_var/cnt).item()
6.3370771408081055