Notebook
# Parse the raw onpromotion flag: anything other than the string '0' counts
# as "on promotion".  NOTE(review): this also maps NaN/missing values to
# True — confirm that is the intended treatment of absent promotion data.
train.onpromotion = train.onpromotion != '0'
test.onpromotion = test.onpromotion != '0'
# %add_datepart — notebook cell marker; presumably fastai's add_datepart
# was applied to the Date column here — TODO confirm against the original notebook.
def get_elapsed(fld, pre, df=None):
    """For each row, compute the number of days elapsed since the most recent
    row (within the same store) where boolean column `fld` was truthy, and
    write the result back as a new column named `pre + fld`.

    Parameters
    ----------
    fld : str
        Name of a boolean event column in `df`.
    pre : str
        Prefix for the output column name.
    df : pandas.DataFrame, optional
        Frame with `Store`, `Date` (datetime64) and `fld` columns, assumed
        sorted so each store's rows are contiguous and in date order —
        TODO confirm the caller guarantees this ordering.  Defaults to the
        module-level `df` (the original notebook's behavior).
    """
    if df is None:
        df = globals()["df"]  # backward compatible: original read the global
    day1 = np.timedelta64(1, 'D')
    last_date = np.datetime64()  # NaT until the first event is seen
    last_store = 0
    res = []
    for s, v, d in zip(df.Store.values, df[fld].values, df.Date.values):
        if s != last_store:
            # New store: reset the "last event" marker.
            last_date = np.datetime64()
            last_store = s
        if v:
            last_date = d
        # NOTE(review): before any event in a store, d - NaT is NaT and
        # .astype(int) yields int64 min — same sentinel as the original.
        res.append(((d - last_date).astype('timedelta64[D]') / day1).astype(int))
    df[pre + fld] = res
# %add_datepart — notebook cell marker; presumably add_datepart was applied
# again after the elapsed-time features — TODO confirm against the original notebook.
# Cast every continuous predictor to float32 in both the train and test
# frames so model-input dtypes match between them.
for col in contin_vars:
    joined[col] = joined[col].astype('float32')
    joined_test[col] = joined_test[col].astype('float32')
# TODO(review): confirm this cell — apply_cats(joined_test, joined) should be
# run so joined_test reuses the categorical encodings fitted on joined.
# Work on the full dataset, indexed by Date.
# NOTE(review): the source had `samp_size = n` fused before this line; since
# samp_size is immediately reassigned below, n-row subsampling was evidently
# abandoned, so the dead assignment is dropped.
joined_samp = joined.set_index("Date")
samp_size = len(joined_samp)
samp_size
%proc_df returns [x, y, nas, mapper (optional)]: x: the transformed version of df — it no longer contains the response variable and is entirely numeric. y: the response variable. nas (handles missing values): a dictionary of the NA indicator columns proc_df created and their associated medians. mapper: a DataFrameMapper storing the mean and standard deviation of the corresponding continuous variables, used for scaling during test time.
# Validation window: 2016-08-16 through 2016-08-31 inclusive.
# BUG FIX: the original comparisons were inverted
# ((index <= Aug 16) & (index >= Aug 31)), which can never both hold and
# always produced an empty validation set.
val_idx = np.flatnonzero(
    (df.index >= datetime.datetime(2016, 8, 16))
    & (df.index <= datetime.datetime(2016, 8, 31)))
# Favorita
# Normalized Weighted Root Mean Squared Logarithmic Error (NWRMSLE)
# https://www.kaggle.com/tkm2261/eval-metric-and-evaluating-last-year-sales-bench
# NOTE(review): the WEIGHTS assignment was truncated in the source
# (`WEIGHTS =` with no right-hand side).  Per the competition's metric it is
# presumably 1 + 0.25 * items' "perishable" flag — restore it before relying
# on the default argument below.
# WEIGHTS = ...

def NWRMSLE(y, pred, weights=None):
    """Normalized weighted root mean squared logarithmic error.

    Parameters
    ----------
    y, pred : numpy arrays
        True and predicted unit sales; negatives are clipped to 0 first.
    weights : numpy array, optional
        Per-row weights.  Defaults to the module-level WEIGHTS
        (the original notebook's behavior).

    Returns
    -------
    float
        sqrt( nansum(w * (log1p(pred) - log1p(y))**2) / w.sum() ).
    """
    if weights is None:
        weights = WEIGHTS
    y = y.clip(0, y.max())
    pred = pred.clip(0, pred.max())
    score = np.nansum(weights * ((np.log1p(pred) - np.log1p(y)) ** 2)) / weights.sum()
    return np.sqrt(score)
# Build the structured-data learner: embedding sizes for the categorical
# variables, the remaining (continuous) column count, 0.04 embedding dropout,
# a single output, two hidden layers of 1000/500 units with dropouts
# 0.001/0.01, and outputs squashed into y_range.
m = md.get_learner(emb_szs, len(df.columns) - len(cat_vars),
                   0.04, 1, [1000, 500], [0.001, 0.01], y_range=y_range)
lr = 1e-3