#!/usr/bin/env python
# coding: utf-8

# # CIFAR-10 Darknet

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# ## Data Download

# CIFAR-10 dataset website: https://www.cs.toronto.edu/~kriz/cifar.html
#
# Direct download link for the CIFAR-10 dataset in PNG format (instead of the raw Python pickle format): http://files.fast.ai/data/cifar10.tgz

# In[37]:

get_ipython().system('aria2c --file-allocation=none -c -x 5 -s 5 -d data/ http://files.fast.ai/data/cifar10.tgz')

# In[38]:

get_ipython().system('tar -xzf data/cifar10.tgz --directory data/')

# **Setup directory and file paths**

# In[82]:

from fastai.conv_learner import *

PATH = Path('data/cifar10/')
os.makedirs(PATH, exist_ok=True)
torch.cuda.set_device(0)

# In[83]:

# My own code for cutting the training duration down from ~3 hr to ~1 hr
torch.backends.cudnn.benchmark = True

# Compute CIFAR-10 dataset stats

# In[43]:

import torchvision  # needed for torchvision.datasets below
import torchvision.transforms as transforms

# ToTensor() converts an image whose elements are in the range 0-255 to a tensor in the range 0-1
train_transform = transforms.Compose([transforms.ToTensor()])
train_set = torchvision.datasets.CIFAR10(root='./cifar10', train=True, download=True, transform=train_transform)

# train_set.train_data returns a numpy ndarray
# train_set.train_data.shape returns (50000, 32, 32, 3)
print(train_set.train_data.mean(axis=(0, 1, 2)) / 255)
print(train_set.train_data.std(axis=(0, 1, 2)) / 255)

# In[45]:

get_ipython().run_line_magic('rm', '-rf cifar10/')

# Build a network from scratch

# In[84]:

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# these numbers are the per-channel means and standard deviations for CIFAR-10, computed above
stats = (np.array([0.4914, 0.48216, 0.44653]), np.array([0.24703, 0.24349, 0.26159]))

num_workers = num_cpus() // 2  # num_cpus() returns 4
bs = 256
sz = 32

# In[85]:

tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomFlip()], pad=sz // 8)
data = ImageClassifierData.from_paths(PATH, val_name='test', tfms=tfms, bs=bs,
                                      num_workers=num_workers)  # pass the worker count computed above

# Architecture

# In[86]:

def conv_layer(ni, nf, ks=3, stride=1):
    return nn.Sequential(
        nn.Conv2d(in_channels=ni, out_channels=nf, kernel_size=ks, bias=False, stride=stride, padding=ks // 2),
        nn.BatchNorm2d(num_features=nf, momentum=0.01),
        nn.LeakyReLU(negative_slope=0.1, inplace=True)
    )
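# A quick sanity check of my own (not from the lesson): with `padding=ks // 2`, a stride-1 `conv_layer` preserves the 32x32 spatial size, and `stride=2` halves it. This assumes PyTorch >= 0.4, where plain tensors can be fed through modules without wrapping in `Variable`.

# In[ ]:

# Trace shapes through throwaway conv_layer instances (my own check)
x = torch.randn(2, 3, 32, 32)                  # dummy batch: 2 RGB 32x32 images
print(conv_layer(3, 32)(x).shape)              # torch.Size([2, 32, 32, 32]): spatial size unchanged
print(conv_layer(3, 32, stride=2)(x).shape)    # torch.Size([2, 32, 16, 16]): halved by stride 2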
# In[87]:

class ResLayer(nn.Module):
    def __init__(self, ni):
        super().__init__()
        self.conv1 = conv_layer(ni, ni // 2, ks=1)
        self.conv2 = conv_layer(ni // 2, ni, ks=3)

    def forward(self, x):
        # Received this error with the in-place version:
        #   "RuntimeError: one of the variables needed for gradient computation
        #    has been modified by an inplace operation"
        # Others facing this problem: http://forums.fast.ai/t/part-2-lesson-12-wiki/15023/175?u=cedric
        #
        # Jeremy suspects it may only work on PyTorch 0.4:
        # http://forums.fast.ai/t/part-2-lesson-12-wiki/15023/176?u=cedric
        #
        # Disabled for now and replaced with the line below, which does the addition out-of-place.
        # return x.add_(self.conv2(self.conv1(x)))
        return x.add(self.conv2(self.conv1(x)))

# In[88]:

class Darknet(nn.Module):
    def make_group_layer(self, ch_in, num_blocks, stride=1):
        return [conv_layer(ch_in, ch_in * 2, stride=stride)
               ] + [ResLayer(ch_in * 2) for i in range(num_blocks)]

    def __init__(self, num_blocks, num_classes, nf=32):
        super().__init__()
        layers = [conv_layer(3, nf, ks=3, stride=1)]
        for i, nb in enumerate(num_blocks):
            layers += self.make_group_layer(nf, nb, stride=2 - (i == 1))
            nf *= 2
        layers += [nn.AdaptiveAvgPool2d(1), Flatten(), nn.Linear(nf, num_classes)]
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

# **Define `Darknet`**

# Create 5 group layers: the first contains 1 extra ResLayer, the second 2, then 4, 6 and 3, and we start with 32 filters. Each group layer begins with one convolution that doubles the number of filters, so the groups take 32, 64, 128, 256 and 512 filters in, and the last group puts out 1024. Nearly all of the network is those bunches of layers, and remember, every one of those group layers also has that one convolution at the start. Before all of that happens there is one convolutional layer at the very start, and at the very end we do our standard adaptive average pooling, flatten, and a linear layer to produce the number of classes.

# In[126]:

m = Darknet([1, 2, 4, 6, 3], num_classes=10, nf=32)
# m = nn.DataParallel(m, [1, 2, 3])  # disabled for single GPU
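# To make the group-layer arithmetic above concrete, here is a small check of my own (again assuming PyTorch >= 0.4). It probes a separate throwaway instance so the `m` we train is left untouched: 1 + 2 + 4 + 6 + 3 = 16 ResLayers, and the 1024 pooled features end in 10 logits.

# In[ ]:

# My own sanity check of the architecture described above
m_chk = Darknet([1, 2, 4, 6, 3], num_classes=10, nf=32)
print(sum(isinstance(l, ResLayer) for l in m_chk.layers))  # 16 ResLayers: 1 + 2 + 4 + 6 + 3
x = torch.randn(2, 3, 32, 32)
print(m_chk(x).shape)  # torch.Size([2, 10]): 1024 features -> linear -> 10 classes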
# **Train the model**

# In[90]:

lr = 1.3

# In[127]:

learn = ConvLearner.from_model_data(m, data)
learn.crit = nn.CrossEntropyLoss()
learn.metrics = [accuracy]
wd = 1e-4

# Fit the model and time it

# In[77]:

# before setting torch.backends.cudnn.benchmark = True
get_ipython().run_line_magic('time', 'learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(20, 20, 0.95, 0.85))')

# In[92]:

# after setting torch.backends.cudnn.benchmark = True
get_ipython().run_line_magic('time', 'learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(20, 20, 0.95, 0.85))')

# note-to-self:
# a better name for cycle_len -> num_epochs_per_cycle
# The cycle_len parameter governs how long we ride that cosine curve as we decrease the learning rate.
# Cycles are measured in epochs, so cycle_len=1 by itself would mean continually decreasing the learning rate
# over the course of one epoch and then jumping it back up. The cycle_mult parameter multiplies the length of
# each successive cycle as soon as one finishes.
# So here we do 1 cycle of length 30 epochs: 30 epochs in total, with no restarts
# (and since we pass use_clr_beta, the schedule is CLR rather than SGDR's cosine annealing).

# ### ================================== START DEBUG ==================================
#
# #### Find Learning Rate

# In[93]:

learn.lr_find()

# In[95]:

learn.sched.plot()

# In[96]:

learn.sched.plot_lr()

# In[100]:

learn.sched.plot(0, 20)

# In[101]:

learn.lr_find2()

# In[120]:

learn.sched.plot(0, 85, smoothed=False)

# ### ================================== END DEBUG ==================================

# #### Experiment with different learning rates

# In[125]:

lr = 1.75e-5
get_ipython().run_line_magic('time', 'learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(20, 20, 0.95, 0.85))')

# In[128]:

lr = 2e-3
get_ipython().run_line_magic('time', 'learn.fit(lr, 1, wds=wd, cycle_len=30, use_clr_beta=(20, 20, 0.95, 0.85))')

# In[129]:

# My own code for testing
def get_TTA_accuracy(learn):
    preds, targs = learn.TTA()
    # combine the predictions across the augmented and non-augmented inputs:
    # 0.6 weight on the original image, 0.4 on the mean of the augmented copies
    preds = 0.6 * preds[0] + 0.4 * preds[1:].mean(0)
    return accuracy_np(preds, targs)

# In[130]:

get_TTA_accuracy(learn)

# In[131]:

# My own code for testing
learn.sched.plot_loss()

# In[132]:

# My own code for testing
learn.sched.plot_lr()

# In[133]:

learn.save('darknet_lr_2e-3')

# **_Note: I didn't try the other experiments in the original notebook._**
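# A follow-up sketch of my own (not in the original notebook): `learn.load` is fastai 0.7's counterpart to `learn.save`, reading the checkpoint back from PATH/models/<name>.h5, so the run above can be restored and re-scored later without retraining.

# In[ ]:

# Restore the checkpoint saved above and re-check TTA accuracy (my own sketch)
learn.load('darknet_lr_2e-3')
print(get_TTA_accuracy(learn))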