%load_ext autoreload
%autoreload 2
%matplotlib inline

#export
from exp.nb_06 import *

## ConvNet baseline

x_train,y_train,x_valid,y_valid = get_data()
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)

nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy

data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

mnist_view = view_tfm(1,28,28)
cbfs = [Recorder,
        partial(AvgStatsCallback,accuracy),
        CudaCallback,
        partial(BatchTransformXCallback, mnist_view)]

nfs = [8,16,32,64,64]

learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)

%time run.fit(2, learn)

## Batchnorm: a custom implementation

class BatchNorm(nn.Module):
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        # NB: pytorch bn mom is opposite of what you'd expect
        self.mom,self.eps = mom,eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('vars',  torch.ones (1,nf,1,1))
        self.register_buffer('means', torch.zeros(1,nf,1,1))

    def update_stats(self, x):
        # statistics are taken over the batch and spatial dims, per channel
        m = x.mean((0,2,3), keepdim=True)
        v = x.var ((0,2,3), keepdim=True)
        self.means.lerp_(m, self.mom)
        self.vars.lerp_ (v, self.mom)
        return m,v

    def forward(self, x):
        if self.training:
            with torch.no_grad(): m,v = self.update_stats(x)
        else: m,v = self.means,self.vars
        x = (x-m) / (v+self.eps).sqrt()
        return x*self.mults + self.adds

def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    # No bias needed if using bn
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(BatchNorm(nf))
    return nn.Sequential(*layers)

#export
def init_cnn_(m, f):
    if isinstance(m, nn.Conv2d):
        f(m.weight, a=0.1)
        if getattr(m, 'bias', None) is not None: m.bias.data.zero_()
    for l in m.children(): init_cnn_(l, f)

def init_cnn(m, uniform=False):
    f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
    init_cnn_(m, f)

def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
    model = get_cnn_model(data, nfs, layer, **kwargs)
    init_cnn(model, uniform=uniform)
    return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)

learn,run = get_learn_run(nfs, data, 0.9, conv_layer, cbs=cbfs)

with Hooks(learn.model, append_stats) as hooks:
    run.fit(1, learn)

    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks[:-1]:
        ms,ss = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
        h.remove()
    plt.legend(range(len(hooks)-1));

    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks[:-1]:
        ms,ss = h.stats
        ax0.plot(ms)
        ax1.plot(ss)

learn,run = get_learn_run(nfs, data, 1.0, conv_layer, cbs=cbfs)

%time run.fit(3, learn)

## Builtin batchnorm

#export
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(nn.BatchNorm2d(nf, eps=1e-5, momentum=0.1))
    return nn.Sequential(*layers)

learn,run = get_learn_run(nfs, data, 1., conv_layer, cbs=cbfs)

%time run.fit(3, learn)

## With scheduler

sched = combine_scheds([0.3, 0.7], [sched_lin(0.6, 2.), sched_lin(2., 0.1)])

learn,run = get_learn_run(nfs, data, 0.9, conv_layer, cbs=cbfs
                          +[partial(ParamScheduler, 'lr', sched)])

run.fit(8, learn)

## Layer norm: one mean/variance per sample, over channels and spatial dims

class LayerNorm(nn.Module):
    __constants__ = ['eps']
    def __init__(self, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.mult = nn.Parameter(tensor(1.))
        self.add  = nn.Parameter(tensor(0.))

    def forward(self, x):
        m = x.mean((1,2,3), keepdim=True)
        v = x.var ((1,2,3), keepdim=True)
        x = (x-m) / ((v+self.eps).sqrt())
        return x*self.mult + self.add
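LayerNorm normalizes each sample with its own statistics (over the channel and spatial dimensions), so unlike BatchNorm its output for a given image does not depend on what else is in the batch. The quick check below is not part of the original notebook; it is a minimal sketch assuming the LayerNorm class above and the notebook's imports (torch etc.) are in scope, with arbitrary tensor shapes:

# Sketch: LayerNorm's output for one sample is independent of the rest of the batch.
x = torch.randn(16, 8, 14, 14)        # (batch, channels, height, width) - arbitrary sizes
ln = LayerNorm()

y_batch  = ln(x)                      # normalize all 16 samples together
y_single = ln(x[:1])                  # normalize the first sample on its own

print(torch.allclose(y_batch[:1], y_single))        # True: same result either way
print(y_batch.mean((1,2,3)).abs().max().item())     # per-sample means are ~0
print(y_batch.std((1,2,3)).mean().item())           # per-sample stds are ~1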
def conv_ln(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=True),
              GeneralRelu(**kwargs)]
    if bn: layers.append(LayerNorm())
    return nn.Sequential(*layers)

learn,run = get_learn_run(nfs, data, 0.8, conv_ln, cbs=cbfs)

%time run.fit(3, learn)

## Instance norm: one mean/variance per sample and per channel

class InstanceNorm(nn.Module):
    __constants__ = ['eps']
    def __init__(self, nf, eps=1e-0):
        super().__init__()
        self.eps = eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))

    def forward(self, x):
        m = x.mean((2,3), keepdim=True)
        v = x.var ((2,3), keepdim=True)
        res = (x-m) / ((v+self.eps).sqrt())
        return res*self.mults + self.adds

def conv_in(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=True),
              GeneralRelu(**kwargs)]
    if bn: layers.append(InstanceNorm(nf))
    return nn.Sequential(*layers)

learn,run = get_learn_run(nfs, data, 0.1, conv_in, cbs=cbfs)

%time run.fit(3, learn)

## Small batch sizes: batch statistics get very noisy with only 2 samples per batch

data = DataBunch(*get_dls(train_ds, valid_ds, 2), c)

def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(nn.BatchNorm2d(nf, eps=1e-5, momentum=0.1))
    return nn.Sequential(*layers)

learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)

%time run.fit(1, learn)

## Running batch norm

class RunningBatchNorm(nn.Module):
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        self.mom,self.eps = mom,eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('sums',  torch.zeros(1,nf,1,1))
        self.register_buffer('sqrs',  torch.zeros(1,nf,1,1))
        self.register_buffer('batch', tensor(0.))
        self.register_buffer('count', tensor(0.))
        self.register_buffer('step',  tensor(0.))
        self.register_buffer('dbias', tensor(0.))

    def update_stats(self, x):
        bs,nc,*_ = x.shape
        self.sums.detach_()
        self.sqrs.detach_()
        dims = (0,2,3)
        s  = x.sum(dims, keepdim=True)
        ss = (x*x).sum(dims, keepdim=True)
        c  = self.count.new_tensor(x.numel()/nc)
        # smaller batches have noisier stats, so update the running sums more slowly
        mom1 = 1 - (1-self.mom)/math.sqrt(bs-1)
        self.mom1 = self.dbias.new_tensor(mom1)
        self.sums.lerp_(s, self.mom1)
        self.sqrs.lerp_(ss, self.mom1)
        self.count.lerp_(c, self.mom1)
        # debiasing factor, as in Adam: corrects for the zero-initialized running sums
        self.dbias = self.dbias*(1-self.mom1) + self.mom1
        self.batch += bs
        self.step  += 1

    def forward(self, x):
        if self.training: self.update_stats(x)
        sums = self.sums
        sqrs = self.sqrs
        c = self.count
        if self.step<100:
            # only debias early on; after enough steps the correction is negligible
            sums = sums / self.dbias
            sqrs = sqrs / self.dbias
            c    = c    / self.dbias
        means = sums/c
        vars = (sqrs/c).sub_(means*means)
        # until we've seen ~20 images, the variance estimate can be unreliable
        if bool(self.batch < 20): vars.clamp_min_(0.01)
        x = (x-means).div_((vars.add_(self.eps)).sqrt())
        return x.mul_(self.mults).add_(self.adds)

def conv_rbn(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(RunningBatchNorm(nf))
    return nn.Sequential(*layers)

learn,run = get_learn_run(nfs, data, 0.4, conv_rbn, cbs=cbfs)

%time run.fit(1, learn)

## What can we do in a single epoch?

data = DataBunch(*get_dls(train_ds, valid_ds, 32), c)

learn,run = get_learn_run(nfs, data, 0.9, conv_rbn, cbs=cbfs
                          +[partial(ParamScheduler, 'lr', sched_lin(1., 0.2))])

%time run.fit(1, learn)

nb_auto_export()
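As a closing aside: the point of RunningBatchNorm is that it never normalizes with the statistics of the current batch alone, only with smoothed, debiased running sums, which is why it still behaves sensibly with tiny batches and at inference time. The smoke test below is not from the original notebook; it is a rough sketch assuming the RunningBatchNorm class above is in scope, with arbitrary shapes, seed, and number of batches:

# Sketch: RunningBatchNorm normalizes sensibly even when fed tiny batches.
torch.manual_seed(42)
rbn = RunningBatchNorm(8)

rbn.train()
for _ in range(50):                              # a stream of 2-sample batches
    rbn(torch.randn(2, 8, 14, 14) * 3 + 5)       # synthetic data with mean 5, std 3

rbn.eval()
out = rbn(torch.randn(1, 8, 14, 14) * 3 + 5)     # a single sample at inference time

# The debiased running statistics should have moved close to the data
# distribution, so the output is roughly zero-mean / unit-variance even for bs=1.
print(out.mean().item(), out.std().item())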