%reload_ext autoreload
%autoreload 2
#export
from nb_008 import *
To create the feature-engineered files train_clean and test_clean from the initial data, run x_009a_rossman_data_clean.
# Folder that holds the Rossmann feather files read below.
PATH = Path('data/rossmann/')
# Load the feature-engineered train and test tables.
train_df = pd.read_feather(PATH/'train_clean')
test_df = pd.read_feather(PATH/'test_clean')
train_df.head()
# Names of the columns handled as categorical variables.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
'SchoolHoliday_fw', 'SchoolHoliday_bw']
# Names of the columns handled as continuous variables.
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
n = len(train_df); n
# Shuffle all row positions, keep (at most) the first 2000 and put them back in ascending order.
idx = np.random.permutation(range(n))[:2000]
idx.sort()
# Both small frames are carved out of train_df: the 1000 lowest sampled positions, then the remainder.
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
# Restrict the toy example to two continuous and three categorical columns, plus `Sales`.
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars+small_cont_vars + ['Sales']]
small_train_df.head()
small_test_df.head()
#export
StrList = Collection[str]

@dataclass
class TabularTransform():
    "Base class for a transform applied to a tabular dataframe"
    cat_names:StrList
    cont_names:StrList

    def __call__(self, df:DataFrame, test:bool=False):
        "Send `df` to `apply_test` when `test` is set, otherwise to `apply_train`"
        if test: self.apply_test(df)
        else: self.apply_train(df)

    def apply_train(self, df:DataFrame):
        "Hook run on the training dataframe; subclasses have to override it"
        raise NotImplementedError

    def apply_test(self, df:DataFrame):
        "Hook run on the test dataframe; defers to `apply_train` unless overridden"
        self.apply_train(df)
#export
class Categorify(TabularTransform):
    "Turn the columns listed in `cat_names` into ordered pandas categoricals."

    def apply_train(self, df:DataFrame):
        "Convert each categorical column of `df` in place and remember the categories it produced"
        self.categories = {}
        for col in self.cat_names:
            as_category = df[col].astype('category')
            df[col] = as_category.cat.as_ordered()
            self.categories[col] = df[col].cat.categories

    def apply_test(self, df:DataFrame):
        "Re-encode each categorical column of `df` with the categories stored by `apply_train`"
        for col in self.cat_names:
            known = self.categories[col]
            df[col] = pd.Categorical(df[col], categories=known, ordered=True)
# Learn the categories on the small training frame, then reuse them on the small test frame.
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)
small_test_df.head()
# Integer codes behind the categorical columns.
small_train_df['PromoInterval'].cat.codes
# Test values that are not among the training categories show up with code -1 (pandas' code for missing).
small_test_df['Store'].cat.codes
#export
FillStrategy = IntEnum('FillStrategy', 'MEDIAN COMMON CONSTANT')

@dataclass
class FillMissing(TabularTransform):
    "Replace missing values in the continuous columns, optionally flagging them in a `<col>_na` column"
    fill_strategy:FillStrategy=FillStrategy.MEDIAN
    add_col:bool=True
    fill_val:float=0.

    def _flag_missing(self, df:DataFrame, col:str):
        "Store the null mask of `col` in `col+'_na'` and register that column in `cat_names`"
        flag = col+'_na'
        df[flag] = pd.isnull(df[col])
        if flag not in self.cat_names: self.cat_names.append(flag)

    def _pick_filler(self, df:DataFrame, col:str):
        "Replacement value for `col`: median, `fill_val`, or (any other strategy) the most frequent value"
        if self.fill_strategy == FillStrategy.MEDIAN: return df[col].median()
        if self.fill_strategy == FillStrategy.CONSTANT: return self.fill_val
        return df[col].dropna().value_counts().idxmax()

    def apply_train(self, df:DataFrame):
        "Fill every continuous column of `df` that contains nulls and record the value used in `na_dict`"
        self.na_dict = {}
        for col in self.cont_names:
            if not pd.isnull(df[col]).sum(): continue
            if self.add_col: self._flag_missing(df, col)
            filler = self._pick_filler(df, col)
            df[col] = df[col].fillna(filler)
            self.na_dict[col] = filler

    def apply_test(self, df:DataFrame):
        "Fill `df` with the values recorded by `apply_train`; columns absent from `na_dict` are left untouched"
        for col in self.cont_names:
            if col not in self.na_dict: continue
            if self.add_col: self._flag_missing(df, col)
            df[col] = df[col].fillna(self.na_dict[col])
# Fit the fill values on the small training frame, then apply the same values to the small test frame.
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)
# Rows whose CompetitionDistance was missing before filling.
# NOTE: the `_na` column is only created when the training sample had missing values in that column.
small_train_df[small_train_df['CompetitionDistance_na'] == True]
small_test_df[small_test_df['CompetitionDistance_na'] == True]
#export
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
OptStrList = Optional[StrList]
OptStats = Optional[Tuple[np.ndarray, np.ndarray]]
OptTabTfms = Optional[Collection[TabularTransform]]
OptDataFrame = Optional[DataFrame]

class TabularDataset(DatasetBase):
    "Dataset yielding `((categorical codes, normalized continuous values), target)` built from a dataframe"
    def __init__(self, df:DataFrame, dep_var:str, cat_names:OptStrList=None, cont_names:OptStrList=None,
                 stats:OptStats=None, log_output:bool=False):
        """
        Build the tensors from `df`.

        - `dep_var`: name of the target column; a non-numeric target is replaced in `df` by its category codes.
        - `cat_names`: columns that must already have a categorical dtype; their codes are shifted by one.
        - `cont_names`: columns cast to float32 and normalized with `stats`.
        - `stats`: `(means, stds)` used for normalization; computed from `df` when `None`.
        - `log_output`: if set, the target is converted to float and its log is taken.
        """
        if not is_numeric_dtype(df[dep_var]): df[dep_var] = df[dep_var].cat.codes
        self.y = torch.tensor(df[dep_var].values)
        if log_output: self.y = torch.log(self.y.float())
        n = len(self.y)
        if cat_names:
            # Codes start at -1 (missing), so the +1 shift makes every index non-negative
            self.cats = np.stack([col.cat.codes.values for _,col in df[list(cat_names)].items()], 1) + 1
        else: self.cats = np.zeros((n,1))
        self.cats = LongTensor(self.cats.astype(np.int64))
        if cont_names:
            self.conts = np.stack([col.astype('float32').values for _,col in df[list(cont_names)].items()], 1)
            means, stds = stats if stats is not None else (self.conts.mean(0), self.conts.std(0))
            # A constant column has a zero std; dividing by it would fill the column with NaN/inf,
            # so such columns are only centered. The stats are stored unmodified.
            safe_stds = np.where(stds == 0, 1, stds)
            self.conts = (self.conts - means[None]) / safe_stds[None]
            self.stats = means,stds
        else:
            self.conts = np.zeros((n,1), dtype=np.float32)
            self.stats = None
        self.conts = FloatTensor(self.conts)

    def __len__(self) -> int:
        "Number of rows"
        return len(self.y)

    def __getitem__(self, idx) -> Tuple[Tuple[LongTensor,FloatTensor], Tensor]:
        "Return `((cats, conts), y)` for `idx`"
        return ((self.cats[idx], self.conts[idx]), self.y[idx])

    @property
    def c(self) -> int:
        "Always 1"
        return 1

    @classmethod
    def from_dataframe(cls, df:DataFrame, dep_var:str, tfms:OptTabTfms=None, cat_names:OptStrList=None,
                       cont_names:OptStrList=None, stats:OptStats=None, log_output:bool=False) -> 'TabularDataset':
        """
        Create a `TabularDataset` from `df` after applying the optional transforms.

        Each entry of `tfms` is either a `TabularTransform` instance, which is applied with `test=True`,
        or a callable (typically a transform class) that is instantiated with the current names and
        applied in train mode. The resulting instances are exposed as `ds.tfms`, next to `ds.cat_names`
        and `ds.cont_names`. When the names are `None` they are inferred from the dtypes of `df`.
        """
        if cat_names is None: cat_names = [n for n in df.columns if is_categorical_dtype(df[n])]
        if cont_names is None: cont_names = [n for n in df.columns if is_numeric_dtype(df[n]) and not n==dep_var]
        # Work on a copy: the caller's list of transform classes must not be overwritten with fitted instances
        tfms = [] if tfms is None else list(tfms)
        for i,tfm in enumerate(tfms):
            if isinstance(tfm, TabularTransform): tfm(df, test=True)
            else:
                tfm = tfm(cat_names, cont_names)
                tfm(df)
                tfms[i] = tfm
                # A transform may have added columns (e.g. `_na` flags) for the next one to process
                cat_names, cont_names = tfm.cat_names, tfm.cont_names
        ds = cls(df, dep_var, cat_names, cont_names, stats, log_output)
        ds.tfms,ds.cat_names,ds.cont_names = tfms,cat_names,cont_names
        return ds
def data_from_tabulardf(path, train_df:DataFrame, valid_df:DataFrame, dep_var:str, test_df:OptDataFrame=None,
                        tfms:OptTabTfms=None, cat_names:OptStrList=None, cont_names:OptStrList=None,
                        stats:OptStats=None, log_output:bool=False, **kwargs) -> DataBunch:
    """
    Create a `DataBunch` from train/valid/test dataframes.

    The transforms are fitted on `train_df`; the fitted transforms, column names and normalization
    stats of the training dataset are then reused for `valid_df` and, when given, `test_df`.
    `test_df` goes through `TabularDataset.from_dataframe` like the others, so it needs a `dep_var`
    column too. `kwargs` are forwarded to `DataBunch.create`.
    """
    train_ds = TabularDataset.from_dataframe(train_df, dep_var, tfms, cat_names, cont_names, stats, log_output)
    fitted = (train_ds.tfms, train_ds.cat_names, train_ds.cont_names, train_ds.stats, log_output)
    datasets = [train_ds, TabularDataset.from_dataframe(valid_df, dep_var, *fitted)]
    # `is not None` rather than truthiness: the truth value of a DataFrame is ambiguous and raises
    if test_df is not None:
        datasets.append(TabularDataset.from_dataframe(test_df, dep_var, *fitted))
    return DataBunch.create(*datasets, path=path, **kwargs)
# Reload the training table and rebuild the small frames (the earlier ones were transformed in place).
train_df = pd.read_feather(PATH/'train_clean')
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars+small_cont_vars + ['Sales']]
dep_var = 'Sales'
tfms = [FillMissing, Categorify] #Fillmissing first so that the added columns are categorified
# The transform classes are instantiated and fitted while building the training dataset.
train_ds = TabularDataset.from_dataframe(small_train_df, dep_var, tfms, small_cat_vars,
small_cont_vars, log_output=True)
# The validation dataset reuses the transforms, column names and stats of the training dataset.
valid_ds = TabularDataset.from_dataframe(small_test_df, dep_var, train_ds.tfms, train_ds.cat_names,
train_ds.cont_names, train_ds.stats, log_output=True)
train_ds[2]
train_ds.stats, valid_ds.stats
small_train_df.head()
# Full-size experiment: the same variable lists as at the top of the notebook.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
'SchoolHoliday_fw', 'SchoolHoliday_bw']
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h',
'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
dep_var = 'Sales'
# Reload the training table and keep only the modelling columns plus `Date`, which drives the split below.
train_df = pd.read_feather(PATH/'train_clean')
train_df = train_df[cat_vars+cont_vars+[dep_var, 'Date']].copy()
test_df['Date'].min(), test_df['Date'].max()
len(test_df)
# Largest row label whose Date equals the Date found at label len(test_df).
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut
train_df = train_df.set_index('Date')
# Everything before `cut` becomes the validation set, the rest the training set.
# NOTE(review): presumably train_df is ordered newest-first, so validation holds the latest dates — confirm.
train_df,valid_df = train_df[cut:], train_df[:cut]
len(train_df),len(valid_df)
# NOTE(review): `tfms` is assigned here, but the call below passes its own literal list.
tfms = [FillMissing, Categorify]
data = data_from_tabulardf(PATH, train_df, valid_df, dep_var, tfms=[FillMissing, Categorify], cat_names=cat_vars,
cont_names=cont_vars, log_output=True, num_workers=0)
#export
ListSizes = Collection[Tuple[int,int]]
OptRange = Optional[Tuple[float,float]]

class TabularModel(nn.Module):
    "Basic model for tabular data: embeddings for the categorical inputs, batchnorm for the continuous ones"
    def __init__(self, emb_szs:ListSizes, n_cont:int, out_sz:int, layers:Collection[int], drops:Collection[float],
                 emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, is_reg:bool=False, is_multi:bool=False):
        """
        - `emb_szs`: one `(ni, nf)` pair per categorical input, passed to `get_embedding`.
        - `n_cont`: number of continuous inputs.
        - `out_sz`: size of the final linear layer.
        - `layers`: sizes of the hidden layers.
        - `drops`: dropout probabilities, one per entry of `layers`; the first linear block gets 0.
          NOTE: the blocks are built with `zip`, so a shorter `drops` silently yields fewer blocks.
        - `emb_drop`: dropout applied to the concatenated embeddings.
        - `y_range`: if given, the output is rescaled to `(y_range[1]-y_range[0]) * x + y_range[0]`.
        - `use_bn`: use batchnorm in every linear block but the first.
        - `is_reg`, `is_multi`: pick the final activation (see below).
        """
        super().__init__()
        self.embeds = nn.ModuleList([get_embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(emb_drop)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        # Regression: sigmoid only when the output is rescaled to `y_range`, otherwise no activation.
        # Classification: log-softmax over the class dimension for multi-class, sigmoid otherwise.
        if is_reg: final_act = None if y_range is None else nn.Sigmoid()
        else: final_act = nn.LogSoftmax(dim=1) if is_multi else nn.Sigmoid()
        # `layers`/`drops` are declared as Collections: coerce them so that tuples work as well as lists
        sizes = [n_emb + n_cont] + list(layers) + [out_sz]
        actns = [nn.ReLU(inplace=True)] * (len(sizes)-2) + [final_act]
        layers = []
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+list(drops),actns)):
            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)
        self.layers = nn.Sequential(*layers)

    def forward(self, x_cat:Tensor, x_cont:Tensor) -> Tensor:
        "Embed column `i` of `x_cat` with `embeds[i]`, append the batchnormed `x_cont` and run the linear blocks"
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        if self.y_range is not None: x = (self.y_range[1] - self.y_range[0]) * x + self.y_range[0]
        # Only drop the trailing output dimension: a bare squeeze() would also remove the batch
        # dimension when the batch holds a single row.
        return x.squeeze(-1)
# One embedding input size per categorical column: its number of categories plus one,
# matching the +1 shift that TabularDataset applies to the category codes.
cat_szs = [len(train_df[n].cat.categories)+1 for n in cat_vars]
# Embedding width: half the input size (rounded up), capped at 50.
emb_szs = [(c, min(50, (c+1)//2)) for c in cat_szs]
emb_szs
# Upper bound of the predictions in log space: 20% above the log of the largest training `Sales` value.
max_log_y = np.log(np.max(train_df['Sales']))
y_range = torch.tensor([0, max_log_y*1.2], device=default_device)
# Regression model with two hidden layers (1000 and 500 units) and a single output.
model = TabularModel(emb_szs, len(cont_vars), 1, [1000,500], [0.001,0.01], emb_drop=0.04, y_range=y_range, is_reg=True)
model
#export
def exp_rmspe(pred:Tensor, targ:Tensor) -> Rank0Tensor:
    "Root mean squared percentage error between `exp(pred)` and `exp(targ)`"
    actual = torch.exp(targ)
    rel_err = (actual - torch.exp(pred)) / actual
    return rel_err.pow(2).mean().sqrt()
# Train with MSE on the log targets and report exp_rmspe as the metric.
learn = Learner(data, model)
learn.loss_fn, learn.metrics = F.mse_loss, [exp_rmspe]
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, 1e-3, wd=0.2, pct_start=0.2)
learn.fit_one_cycle(5, 1e-3, wd=0.1, pct_start=0.3)
# Recompute the percentage error over the whole validation set, summing squared errors
# across batches instead of averaging per-batch values.
with torch.no_grad():
    pct_var,cnt = 0.,0
    for x,y in learn.data.valid_dl:
        out = torch.exp(learn.model(*x))
        y = torch.exp(y)
        cnt += y.size(0)
        pct_var += ((y - out)/y).pow(2).sum()
torch.sqrt(pct_var/cnt).item()