# -----------------------------------------------------------------------------
# Rossmann tabular pipeline (flattened fastai dev notebook).
# Notebook magics are kept as comments so the file parses as plain Python.
# %reload_ext autoreload
# %autoreload 2

#export
from nb_008 import *

PATH = Path('data/rossmann/')
train_df = pd.read_feather(PATH/'train_clean')
test_df = pd.read_feather(PATH/'test_clean')
train_df.head()

cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
            'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
            'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
            'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw',
            'StateHoliday_fw', 'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw']
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC',
             'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
             'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend',
             'trend_DE', 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

n = len(train_df); n

# Small random subset to sanity-check the transforms before the full run.
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]
small_train_df.head()
small_test_df.head()

#export
StrList = Collection[str]

@dataclass
class TabularTransform():
    "A transform for a tabular dataframe."
    cat_names:StrList
    cont_names:StrList

    def __call__(self, df:DataFrame, test:bool=False):
        "Apply the correct function to `df` depending on whether it's the training dataframe."
        func = self.apply_test if test else self.apply_train
        func(df)

    def apply_train(self, df:DataFrame):
        "Function applied to `df` if it's the train set."
        raise NotImplementedError

    def apply_test(self, df:DataFrame):
        "Function applied to `df` if it's the test set; defaults to the train behavior."
        self.apply_train(df)

#export
class Categorify(TabularTransform):
    "Transform the categorical variables to the pandas `category` type."

    def apply_train(self, df:DataFrame):
        # Record the categories seen on the train set so test gets the same mapping.
        self.categories = {}
        for n in self.cat_names:
            df[n] = df[n].astype('category').cat.as_ordered()
            self.categories[n] = df[n].cat.categories

    def apply_test(self, df:DataFrame):
        # Re-use the train categories; unseen test values become NaN (code -1).
        for n in self.cat_names:
            df[n] = pd.Categorical(df[n], categories=self.categories[n], ordered=True)

categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)
small_test_df.head()
small_train_df['PromoInterval'].cat.codes
small_test_df['Store'].cat.codes

#export
FillStrategy = IntEnum('FillStrategy', 'MEDIAN COMMON CONSTANT')

@dataclass
class FillMissing(TabularTransform):
    "Fill the missing values in continuous columns."
    fill_strategy:FillStrategy = FillStrategy.MEDIAN
    add_col:bool = True
    fill_val:float = 0.

    def apply_train(self, df:DataFrame):
        "Compute a filler per NA-containing column, fill, and remember it for the test set."
        self.na_dict = {}
        for name in self.cont_names:
            if pd.isnull(df[name]).sum():
                if self.add_col:
                    # Boolean marker column; registered as categorical for the model.
                    df[name+'_na'] = pd.isnull(df[name])
                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')
                if self.fill_strategy == FillStrategy.MEDIAN:
                    filler = df[name].median()
                elif self.fill_strategy == FillStrategy.CONSTANT:
                    filler = self.fill_val
                else:
                    filler = df[name].dropna().value_counts().idxmax()
                df[name] = df[name].fillna(filler)
                self.na_dict[name] = filler

    def apply_test(self, df:DataFrame):
        "Fill using the values computed on the train set."
        for name in self.cont_names:
            if name in self.na_dict:
                if self.add_col:
                    df[name+'_na'] = pd.isnull(df[name])
                    if name+'_na' not in self.cat_names: self.cat_names.append(name+'_na')
                df[name] = df[name].fillna(self.na_dict[name])

fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)
small_train_df[small_train_df['CompetitionDistance_na'] == True]
small_test_df[small_test_df['CompetitionDistance_na'] == True]

#export
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

OptStrList = Optional[StrList]
OptStats = Optional[Tuple[np.ndarray, np.ndarray]]
OptTabTfms = Optional[Collection[TabularTransform]]
OptDataFrame = Optional[DataFrame]

class TabularDataset(DatasetBase):
    "Class for tabular data."

    def __init__(self, df:DataFrame, dep_var:str, cat_names:OptStrList=None,
                 cont_names:OptStrList=None, stats:OptStats=None, log_output:bool=False):
        if not is_numeric_dtype(df[dep_var]): df[dep_var] = df[dep_var].cat.codes
        self.y = torch.tensor(df[dep_var].values)
        if log_output: self.y = torch.log(self.y.float())
        n = len(self.y)
        if cat_names and len(cat_names) >= 1:
            # +1 shifts category codes so NaN (-1) maps to 0, the "missing" embedding row.
            self.cats = np.stack([c.cat.codes.values for n,c in df[cat_names].items()], 1) + 1
        else:
            self.cats = np.zeros((n,1))
        self.cats = LongTensor(self.cats.astype(np.int64))
        if cont_names and len(cont_names) >= 1:
            self.conts = np.stack([c.astype('float32').values for n,c in df[cont_names].items()], 1)
            # Normalize with given stats (valid/test) or ones computed here (train).
            means, stds = stats if stats is not None else (self.conts.mean(0), self.conts.std(0))
            self.conts = (self.conts - means[None]) / stds[None]
            self.stats = means,stds
        else:
            self.conts = np.zeros((n,1), dtype=np.float32)
            self.stats = None
        self.conts = FloatTensor(self.conts)

    def __len__(self) -> int: return len(self.y)

    def __getitem__(self, idx) -> Tuple[Tuple[LongTensor,FloatTensor], Tensor]:
        return ((self.cats[idx], self.conts[idx]), self.y[idx])

    @property
    def c(self) -> int: return 1

    @classmethod
    def from_dataframe(cls, df:DataFrame, dep_var:str, tfms:OptTabTfms=None,
                       cat_names:OptStrList=None, cont_names:OptStrList=None,
                       stats:OptStats=None, log_output:bool=False) -> 'TabularDataset':
        "Create a tabular dataset from `df` after applying optional transforms."
        if cat_names is None:
            cat_names = [n for n in df.columns if is_categorical_dtype(df[n])]
        if cont_names is None:
            cont_names = [n for n in df.columns if is_numeric_dtype(df[n]) and not n == dep_var]
        if tfms is None: tfms = []
        for i,tfm in enumerate(tfms):
            if isinstance(tfm, TabularTransform):
                # Already-fitted transform (came from the train set): apply in test mode.
                tfm(df, test=True)
            else:
                # Transform class: instantiate, fit on this df, store the fitted instance back.
                tfm = tfm(cat_names, cont_names)
                tfm(df)
                tfms[i] = tfm
                cat_names, cont_names = tfm.cat_names, tfm.cont_names
        ds = cls(df, dep_var, cat_names, cont_names, stats, log_output)
        ds.tfms,ds.cat_names,ds.cont_names = tfms,cat_names,cont_names
        return ds

def data_from_tabulardf(path, train_df:DataFrame, valid_df:DataFrame, dep_var:str,
                        test_df:OptDataFrame=None, tfms:OptTabTfms=None,
                        cat_names:OptStrList=None, cont_names:OptStrList=None,
                        stats:OptStats=None, log_output:bool=False, **kwargs) -> DataBunch:
    "Create a `DataBunch` from train/valid/test dataframes."
    train_ds = TabularDataset.from_dataframe(train_df, dep_var, tfms, cat_names,
                                             cont_names, stats, log_output)
    valid_ds = TabularDataset.from_dataframe(valid_df, dep_var, train_ds.tfms, train_ds.cat_names,
                                             train_ds.cont_names, train_ds.stats, log_output)
    datasets = [train_ds, valid_ds]
    # BUGFIX: `if test_df:` raises ValueError on a DataFrame (ambiguous truth value), and the
    # original called the nonexistent `datasets.appendTabularDataset` on `valid_df`.
    if test_df is not None:
        datasets.append(TabularDataset.from_dataframe(test_df, dep_var, train_ds.tfms,
                                                      train_ds.cat_names, train_ds.cont_names,
                                                      train_ds.stats, log_output))
    return DataBunch.create(*datasets, path=path, **kwargs)

# --- Sanity check of the full transform pipeline on the small subset ---------
train_df = pd.read_feather(PATH/'train_clean')
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']
small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]
dep_var = 'Sales'
tfms = [FillMissing, Categorify]  # FillMissing first so the added _na columns are categorified
train_ds = TabularDataset.from_dataframe(small_train_df, dep_var, tfms,
                                         small_cat_vars, small_cont_vars, log_output=True)
valid_ds = TabularDataset.from_dataframe(small_test_df, dep_var, train_ds.tfms,
                                         train_ds.cat_names, train_ds.cont_names,
                                         train_ds.stats, log_output=True)
train_ds[2]
train_ds.stats, valid_ds.stats
small_train_df.head()

# --- Full dataset: time-based train/valid split ------------------------------
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday',
            'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment',
            'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
            'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw',
            'StateHoliday_fw', 'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw']
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC',
             'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity',
             'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend',
             'trend_DE', 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']
dep_var = 'Sales'
train_df = pd.read_feather(PATH/'train_clean')
train_df = train_df[cat_vars + cont_vars + [dep_var, 'Date']].copy()
test_df['Date'].min(), test_df['Date'].max()
len(test_df)
# Take the most recent len(test_df) rows as validation (data is sorted newest-first).
cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max()
cut
train_df = train_df.set_index('Date')
train_df,valid_df = train_df[cut:], train_df[:cut]
len(train_df),len(valid_df)
tfms = [FillMissing, Categorify]
data = data_from_tabulardf(PATH, train_df, valid_df, dep_var, tfms=[FillMissing, Categorify],
                           cat_names=cat_vars, cont_names=cont_vars, log_output=True,
                           num_workers=0)

#export
ListSizes = Collection[Tuple[int,int]]
OptRange = Optional[Tuple[float,float]]

class TabularModel(nn.Module):
    "Basic model for tabular data."

    def __init__(self, emb_szs:ListSizes, n_cont:int, out_sz:int, layers:Collection[int],
                 drops:Collection[float], emb_drop:float=0., y_range:OptRange=None,
                 use_bn:bool=True, is_reg:bool=False, is_multi:bool=False):
        super().__init__()
        self.embeds = nn.ModuleList([get_embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(emb_drop)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        # Regression: sigmoid only when the output is rescaled to `y_range` in forward().
        if is_reg: final_act = None if y_range is None else nn.Sigmoid()
        # BUGFIX: explicit dim=1 (was implicit/deprecated) for (batch, classes) input.
        else: final_act = nn.LogSoftmax(dim=1) if is_multi else nn.Sigmoid()
        sizes = [n_emb + n_cont] + layers + [out_sz]
        actns = [nn.ReLU(inplace=True)] * (len(sizes)-2) + [final_act]
        layers = []
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+drops,actns)):
            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act)
        self.layers = nn.Sequential(*layers)

    def forward(self, x_cat:Tensor, x_cont:Tensor) -> Tensor:
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        if self.y_range is not None:
            # Rescale the sigmoid output from (0, 1) to (y_range[0], y_range[1]).
            x = (self.y_range[1] - self.y_range[0]) * x + self.y_range[0]
        return x.squeeze()

# +1 in each cardinality reserves row 0 for the "missing/unseen" category.
cat_szs = [len(train_df[n].cat.categories)+1 for n in cat_vars]
emb_szs = [(c, min(50, (c+1)//2)) for c in cat_szs]
emb_szs
max_log_y = np.log(np.max(train_df['Sales']))
y_range = torch.tensor([0, max_log_y*1.2], device=default_device)
model = TabularModel(emb_szs, len(cont_vars), 1, [1000,500], [0.001,0.01], emb_drop=0.04,
                     y_range=y_range, is_reg=True)
model

#export
def exp_rmspe(pred:Tensor, targ:Tensor) -> Rank0Tensor:
    "Root mean squared percentage error, computed after undoing the log of pred/targ."
    pred, targ = torch.exp(pred), torch.exp(targ)
    pct_var = (targ - pred)/targ
    return torch.sqrt((pct_var**2).mean())

learn = Learner(data, model)
learn.loss_fn = F.mse_loss
learn.metrics = [exp_rmspe]
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, 1e-3, wd=0.2, pct_start=0.2)
learn.fit_one_cycle(5, 1e-3, wd=0.1, pct_start=0.3)

# Manual RMSPE over the validation set to cross-check the exp_rmspe metric.
with torch.no_grad():
    pct_var,cnt = 0.,0
    for x,y in learn.data.valid_dl:
        out = learn.model(*x)
        cnt += y.size(0)
        y, out = torch.exp(y), torch.exp(out)
        pct_var += ((y - out)/y).pow(2).sum()
torch.sqrt(pct_var/cnt).item()