MNIST Neural Network - Batch Normalization

In [1]:
# coding: utf-8
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import math

sys.path.append(os.pardir)
from deeplink.mnist import *
from deeplink.networks import *

Multilayer Neural Network Model (Two Hidden Layers): Learning and Validation

Initializers

In [2]:
# coding: utf-8
import numpy as np

class Initializer:
    def __init__(self, params, params_size_list, use_batch_normalization=False):
        self.params = params
        self.params_size_list = params_size_list
        self.use_batch_normalization = use_batch_normalization

    def initialize_params(self):
        raise NotImplementedError

    def get_params(self):
        return self.params


# all parameters zero: symmetry is never broken, so this network cannot learn
class Zero_Initializer(Initializer):
    def initialize_params(self):
        for idx in range(1, len(self.params_size_list)):
            self.params['W' + str(idx)] = np.zeros((self.params_size_list[idx - 1], self.params_size_list[idx]))
            self.params['b' + str(idx)] = np.zeros(self.params_size_list[idx])
            if self.use_batch_normalization and idx < len(self.params_size_list) - 1:
                self.params['gamma' + str(idx)] = np.zeros(self.params_size_list[idx])
                self.params['beta' + str(idx)] = np.zeros(self.params_size_list[idx])

# weights and biases drawn from N(0, 1)
class N1_Initializer(Initializer):
    def initialize_params(self):
        for idx in range(1, len(self.params_size_list)):
            self.params['W' + str(idx)] = np.random.randn(self.params_size_list[idx - 1], self.params_size_list[idx])
            self.params['b' + str(idx)] = np.random.randn(self.params_size_list[idx])
            if self.use_batch_normalization and idx < len(self.params_size_list) - 1:
                self.params['gamma' + str(idx)] = np.random.randn(self.params_size_list[idx])
                self.params['beta' + str(idx)] = np.random.randn(self.params_size_list[idx])

# weights and biases drawn from N(0, 0.01^2)
class N2_Initializer(Initializer):
    def initialize_params(self):
        for idx in range(1, len(self.params_size_list)):
            self.params['W' + str(idx)] = np.random.randn(self.params_size_list[idx - 1], self.params_size_list[idx]) * 0.01
            self.params['b' + str(idx)] = np.random.randn(self.params_size_list[idx]) * 0.01
            if self.use_batch_normalization and idx < len(self.params_size_list) - 1:
                self.params['gamma' + str(idx)] = np.random.randn(self.params_size_list[idx]) * 0.01
                self.params['beta' + str(idx)] = np.random.randn(self.params_size_list[idx]) * 0.01

# Xavier (Glorot) initialization: scale 1/sqrt(n_in), suited to sigmoid/tanh activations
class Xavier_Initializer(Initializer):
    def initialize_params(self):
        for idx in range(1, len(self.params_size_list)):
            self.params['W' + str(idx)] = np.random.randn(self.params_size_list[idx - 1], self.params_size_list[idx]) / np.sqrt(self.params_size_list[idx - 1])
            self.params['b' + str(idx)] = np.random.randn(self.params_size_list[idx]) / np.sqrt(self.params_size_list[idx - 1])
            if self.use_batch_normalization and idx < len(self.params_size_list) - 1:
                self.params['gamma' + str(idx)] = np.random.randn(self.params_size_list[idx]) / np.sqrt(self.params_size_list[idx - 1])
                self.params['beta' + str(idx)] = np.random.randn(self.params_size_list[idx]) / np.sqrt(self.params_size_list[idx - 1])


# He initialization: scale sqrt(2/n_in), suited to ReLU activations
class He_Initializer(Initializer):
    def initialize_params(self):
        for idx in range(1, len(self.params_size_list)):
            self.params['W' + str(idx)] = np.random.randn(self.params_size_list[idx - 1], self.params_size_list[idx]) * np.sqrt(2) / np.sqrt(self.params_size_list[idx - 1])
            self.params['b' + str(idx)] = np.random.randn(self.params_size_list[idx]) * np.sqrt(2) / np.sqrt(self.params_size_list[idx - 1])
            if self.use_batch_normalization and idx < len(self.params_size_list) - 1:
                self.params['gamma' + str(idx)] = np.random.randn(self.params_size_list[idx]) * np.sqrt(2) / np.sqrt(self.params_size_list[idx - 1])
                self.params['beta' + str(idx)] = np.random.randn(self.params_size_list[idx]) * np.sqrt(2) / np.sqrt(self.params_size_list[idx - 1])
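
As a quick sanity check, a hypothetical cell (not part of the original pipeline): weights drawn by He_Initializer should have a standard deviation close to sqrt(2 / n_in).

In [ ]:
# Hypothetical sanity check: He-initialized weights should have std ~ sqrt(2 / fan_in)
params = {}
params_size_list = [784, 128, 10]

he = He_Initializer(params, params_size_list, use_batch_normalization=False)
he.initialize_params()

W1 = he.get_params()['W1']
print(W1.shape)                        # (784, 128)
print(W1.std(), np.sqrt(2.0 / 784))   # both should be roughly 0.0505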

New Layer - Batch Normalization
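
During training, the layer standardizes each feature over the mini-batch and then applies a learned scale $\gamma$ and shift $\beta$, as implemented below:

$$\mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i, \qquad \sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m}(x_i - \mu_B)^2, \qquad \hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \varepsilon}}, \qquad y_i = \gamma\hat{x}_i + \beta$$

At inference time, the mini-batch statistics are replaced by exponential moving averages (running_mean, running_var) accumulated during training.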

In [3]:
class BatchNormalization:
    def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None):
        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.input_shape = None

        self.running_mean = running_mean
        self.running_var = running_var  
        
        # cached in forward for the backward pass
        self.batch_size = None
        self.xc = None
        self.xn = None
        self.std = None
        # parameter gradients computed in backward
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, is_train=True):
        self.input_shape = x.shape
        if x.ndim != 2:
            # flatten convolutional input (N, C, H, W) into (N, D)
            N, C, H, W = x.shape
            x = x.reshape(N, -1)

        out = self.__forward(x, is_train)

        return out.reshape(*self.input_shape)

    def __forward(self, x, is_train):
        if self.running_mean is None:
            N, D = x.shape
            self.running_mean = np.zeros(D)
            self.running_var = np.zeros(D)

        if is_train:
            # standardize each feature with the statistics of the current mini-batch
            mu = x.mean(axis=0)
            xc = x - mu
            var = np.mean(xc**2, axis=0)
            std = np.sqrt(var + 10e-7)
            xn = xc / std

            # cache the intermediate values needed by the backward pass
            self.batch_size = x.shape[0]
            self.xc = xc
            self.xn = xn
            self.std = std
            # update the exponential moving averages used at inference time
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mu
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var
        else:
            # inference: normalize with the running statistics instead
            xc = x - self.running_mean
            xn = xc / np.sqrt(self.running_var + 10e-7)

        out = self.gamma * xn + self.beta
        return out

    def backward(self, dout):
        if dout.ndim != 2:
            N, C, H, W = dout.shape
            dout = dout.reshape(N, -1)

        dx = self.__backward(dout)

        dx = dx.reshape(*self.input_shape)
        return dx

    def __backward(self, dout):
        # gradients of the learned shift and scale
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(self.xn * dout, axis=0)
        # backpropagate through the standardization (following the BN computational graph)
        dxn = self.gamma * dout
        dxc = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc += (2.0 / self.batch_size) * self.xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / self.batch_size

        self.dgamma = dgamma
        self.dbeta = dbeta

        return dx
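
A minimal smoke test, hypothetical and assuming only NumPy plus the class above: in training mode each output feature should be roughly zero-mean and unit-variance, and backward should return a gradient with the input's shape.

In [ ]:
# Hypothetical smoke test for the BatchNormalization layer above
np.random.seed(0)
x = np.random.randn(100, 5) * 3.0 + 2.0      # mini-batch: 100 samples, 5 features
bn = BatchNormalization(gamma=np.ones(5), beta=np.zeros(5))

out = bn.forward(x, is_train=True)
print(out.mean(axis=0))                      # ~0 for every feature
print(out.std(axis=0))                       # ~1 for every feature

dx = bn.backward(np.ones_like(out))
print(dx.shape)                              # (100, 5), same shape as the input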
In [4]:
activation_layers = {
    'Sigmoid': Sigmoid,
    'ReLU': ReLU
}

optimizers = {
    "SGD": SGD,
    "Momentum": Momentum,
    "Nesterov": Nesterov,
    "AdaGrad": AdaGrad,
    "RMSprop": RMSprop,
    "Adam": Adam
}

initializers = {
    'Zero': Zero_Initializer,
    'N1': N1_Initializer,
    'N2': N2_Initializer, # N(0, 0.01^2): used below to contrast Batch Normalization against a poorly scaled initialization
    'Xavier': Xavier_Initializer,
    'He': He_Initializer
}

Multilayer Model Class

In [5]:
class MultiLayerNetExtended(MultiLayerNet):
    def __init__(self, input_size, hidden_size_list, output_size, activation='ReLU', initializer='N2', 
                 optimizer='AdaGrad', learning_rate=0.01, use_batch_normalization=False):
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size_list = hidden_size_list
        self.hidden_layer_num = len(hidden_size_list)
        
        self.use_batch_normalization = use_batch_normalization
        
        # Weight Initialization
        self.params = {}
        self.weight_initialization(initializer)
        
        # Layering
        self.layers = OrderedDict()
        self.last_layer = None
        self.layering(activation)

        # Optimization Method
        self.optimizer = optimizers[optimizer](lr=learning_rate)
    
    def weight_initialization(self, initializer):
        params_size_list = [self.input_size] + self.hidden_size_list + [self.output_size]
        initializer_obj = initializers[initializer](self.params, 
                                                    params_size_list, 
                                                    self.use_batch_normalization)
        initializer_obj.initialize_params()
        
    def layering(self, activation):
        for idx in range(1, self.hidden_layer_num + 1):
            self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])
            if self.use_batch_normalization:
                self.layers['Batch_Normalization' + str(idx)] = BatchNormalization(self.params['gamma' + str(idx)], 
                                                                                   self.params['beta' + str(idx)])
            self.layers['Activation' + str(idx)] = activation_layers[activation]()

        idx = self.hidden_layer_num + 1
        self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], self.params['b' + str(idx)])

        self.last_layer = SoftmaxWithCrossEntropyLoss()  

    def predict(self, x, is_train=False):
        for key, layer in self.layers.items():
            if "BatchNorm" in key:
                x = layer.forward(x, is_train)
            else:
                x = layer.forward(x)
        return x

    def loss(self, x, t, is_train=False):
        y = self.predict(x, is_train)
        return self.last_layer.forward(y, t)

    def accuracy(self, x, t):
        y = self.predict(x, is_train=False)
        y = np.argmax(y, axis=1)
        if t.ndim != 1 : t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy        

    def backpropagation_gradient(self, x, t):
        # forward
        self.loss(x, t, is_train=True)

        # backward
        din = 1
        din = self.last_layer.backward(din)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            din = layer.backward(din)

        grads = {}
        for idx in range(1, self.hidden_layer_num + 2):
            grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW
            grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db

            if self.use_batch_normalization and idx <= self.hidden_layer_num:
                grads['gamma' + str(idx)] = self.layers['Batch_Normalization' + str(idx)].dgamma
                grads['beta' + str(idx)] = self.layers['Batch_Normalization' + str(idx)].dbeta
                
        return grads

    def learning(self, x_batch, t_batch):
        grads = self.backpropagation_gradient(x_batch, t_batch)
        self.optimizer.update(self.params, grads)
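
Before the full MNIST experiment, a minimal usage sketch (hypothetical; it assumes the layers and optimizers imported from deeplink.networks behave as in the cells above): build a small network and take one learning step on random data.

In [ ]:
# Hypothetical single learning step on random data
np.random.seed(0)
net = MultiLayerNetExtended(784, [128, 128], 10,
                            activation='ReLU', initializer='He',
                            optimizer='AdaGrad', learning_rate=0.01,
                            use_batch_normalization=True)

x = np.random.randn(32, 784)                  # a fake mini-batch
t = np.eye(10)[np.random.randint(0, 10, 32)]  # one-hot labels

loss_before = net.loss(x, t, is_train=True)
net.learning(x, t)
loss_after = net.loss(x, t, is_train=True)
print(loss_before, loss_after)                # the loss typically drops after one step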

Training and Evaluation

In [6]:
data = mnist_data("/Users/yhhan/git/aiclass/0.Professor/data/MNIST_data/.")
(img_train, label_train), (img_validation, label_validation), (img_test, label_test) = data.load_mnist(flatten=True, normalize=True, one_hot_label=True)

input_size=784
hidden_layer1_size=128
hidden_layer2_size=128
output_size=10

num_epochs = 50
train_size = img_train.shape[0]
batch_size = 1000
learning_rate = 0.1

markers = {"N2, SGD, No_Batch_Norm": "x", "N2, SGD, Batch_Norm": "o", 
           "N2, AdaGrad, No_Batch_Norm": "+", "N2, AdaGrad, Batch_Norm": "*",
           "He, AdaGrad, No_Batch_Norm": "h", "He, AdaGrad, Batch_Norm": "H"}

networks = {}
train_errors = {}
validation_errors = {}
test_accuracy_values = {}
max_test_accuracy_epoch = {}
max_test_accuracy_value = {}

for key in markers.keys():
    if key == "N2, SGD, No_Batch_Norm":
        networks[key] = MultiLayerNetExtended(input_size, [hidden_layer1_size, hidden_layer2_size], output_size, 
                                activation='ReLU', 
                                initializer='N2',
                                optimizer='SGD', learning_rate=learning_rate,
                                use_batch_normalization=False)
    elif key == "N2, SGD, Batch_Norm":
        networks[key] = MultiLayerNetExtended(input_size, [hidden_layer1_size, hidden_layer2_size], output_size, 
                                activation='ReLU', 
                                initializer='N2',
                                optimizer='SGD', learning_rate=learning_rate,
                                use_batch_normalization=True)
    elif key == "N2, AdaGrad, No_Batch_Norm":
        networks[key] = MultiLayerNetExtended(input_size, [hidden_layer1_size, hidden_layer2_size], output_size, 
                                activation='ReLU', 
                                initializer='N2',
                                optimizer='AdaGrad', learning_rate=learning_rate,
                                use_batch_normalization=False)
    elif key == "N2, AdaGrad, Batch_Norm":
        networks[key] = MultiLayerNetExtended(input_size, [hidden_layer1_size, hidden_layer2_size], output_size, 
                                activation='ReLU', 
                                initializer='N2',
                                optimizer='AdaGrad', learning_rate=learning_rate,
                                use_batch_normalization=True)
    elif key == "He, AdaGrad, No_Batch_Norm":
        networks[key] = MultiLayerNetExtended(input_size, [hidden_layer1_size, hidden_layer2_size], output_size, 
                                activation='ReLU', 
                                initializer='He',
                                optimizer='AdaGrad', learning_rate=learning_rate,
                                use_batch_normalization=False)
    elif key == "He, AdaGrad, Batch_Norm":
        networks[key] = MultiLayerNetExtended(input_size, [hidden_layer1_size, hidden_layer2_size], output_size, 
                                activation='ReLU', 
                                initializer='He',
                                optimizer='AdaGrad', learning_rate=learning_rate,
                                use_batch_normalization=True)
        
    train_errors[key] = [] 
    validation_errors[key] = []
    test_accuracy_values[key] = []
    max_test_accuracy_epoch[key] = 0
    max_test_accuracy_value[key] = 0.0
In [7]:
epoch_list = []

num_batch = math.ceil(train_size / batch_size)

for i in range(num_epochs):
    epoch_list.append(i)
    for key in markers.keys():
        # mini-batches are taken in a fixed order (no shuffling)
        for k in range(num_batch):
            x_batch = img_train[k * batch_size : k * batch_size + batch_size]
            t_batch = label_train[k * batch_size : k * batch_size + batch_size]
            networks[key].learning(x_batch, t_batch)

        # training loss is measured on the last mini-batch of the epoch
        train_loss = networks[key].loss(x_batch, t_batch, is_train=True)
        train_errors[key].append(train_loss)

        validation_loss = networks[key].loss(img_validation, label_validation, is_train=False)
        validation_errors[key].append(validation_loss)    

        test_accuracy = networks[key].accuracy(img_test, label_test)
        test_accuracy_values[key].append(test_accuracy)
        if test_accuracy > max_test_accuracy_value[key]:
            max_test_accuracy_epoch[key] = i            
            max_test_accuracy_value[key] = test_accuracy
        print("{0:26s}-Epoch:{1:3d}, Train Err.:{2:7.5f}, Validation Err.:{3:7.5f}, Test Accuracy:{4:7.5f}, Max Test Accuracy:{5:7.5f}".format(
            key,
            i,
            train_loss,
            validation_loss,
            test_accuracy,
            max_test_accuracy_value[key]
        ))
    print()    
N2, SGD, No_Batch_Norm    -Epoch:  0, Train Err.:2.30142, Validation Err.:2.30115, Test Accuracy:0.11350, Max Test Accuracy:0.11350
N2, SGD, Batch_Norm       -Epoch:  0, Train Err.:2.30076, Validation Err.:2.30081, Test Accuracy:0.11350, Max Test Accuracy:0.11350
N2, AdaGrad, No_Batch_Norm-Epoch:  0, Train Err.:0.28646, Validation Err.:0.27083, Test Accuracy:0.89810, Max Test Accuracy:0.89810
N2, AdaGrad, Batch_Norm   -Epoch:  0, Train Err.:0.08490, Validation Err.:0.12210, Test Accuracy:0.95600, Max Test Accuracy:0.95600
He, AdaGrad, No_Batch_Norm-Epoch:  0, Train Err.:0.27105, Validation Err.:0.23288, Test Accuracy:0.91000, Max Test Accuracy:0.91000
He, AdaGrad, Batch_Norm   -Epoch:  0, Train Err.:0.08500, Validation Err.:0.11982, Test Accuracy:0.95650, Max Test Accuracy:0.95650

N2, SGD, No_Batch_Norm    -Epoch:  1, Train Err.:2.29994, Validation Err.:2.29949, Test Accuracy:0.11350, Max Test Accuracy:0.11350
N2, SGD, Batch_Norm       -Epoch:  1, Train Err.:2.29402, Validation Err.:2.29373, Test Accuracy:0.11350, Max Test Accuracy:0.11350
N2, AdaGrad, No_Batch_Norm-Epoch:  1, Train Err.:0.17610, Validation Err.:0.17474, Test Accuracy:0.93160, Max Test Accuracy:0.93160
N2, AdaGrad, Batch_Norm   -Epoch:  1, Train Err.:0.04542, Validation Err.:0.09554, Test Accuracy:0.96700, Max Test Accuracy:0.96700
He, AdaGrad, No_Batch_Norm-Epoch:  1, Train Err.:0.18877, Validation Err.:0.17099, Test Accuracy:0.93500, Max Test Accuracy:0.93500
He, AdaGrad, Batch_Norm   -Epoch:  1, Train Err.:0.04356, Validation Err.:0.09434, Test Accuracy:0.96700, Max Test Accuracy:0.96700

N2, SGD, No_Batch_Norm    -Epoch:  2, Train Err.:2.29598, Validation Err.:2.29525, Test Accuracy:0.11350, Max Test Accuracy:0.11350
N2, SGD, Batch_Norm       -Epoch:  2, Train Err.:2.21592, Validation Err.:2.21335, Test Accuracy:0.20590, Max Test Accuracy:0.20590
N2, AdaGrad, No_Batch_Norm-Epoch:  2, Train Err.:0.12487, Validation Err.:0.14111, Test Accuracy:0.94520, Max Test Accuracy:0.94520
N2, AdaGrad, Batch_Norm   -Epoch:  2, Train Err.:0.02827, Validation Err.:0.08908, Test Accuracy:0.97040, Max Test Accuracy:0.97040
He, AdaGrad, No_Batch_Norm-Epoch:  2, Train Err.:0.14593, Validation Err.:0.15055, Test Accuracy:0.94480, Max Test Accuracy:0.94480
He, AdaGrad, Batch_Norm   -Epoch:  2, Train Err.:0.03047, Validation Err.:0.08662, Test Accuracy:0.97020, Max Test Accuracy:0.97020

N2, SGD, No_Batch_Norm    -Epoch:  3, Train Err.:2.27636, Validation Err.:2.27430, Test Accuracy:0.18860, Max Test Accuracy:0.18860
N2, SGD, Batch_Norm       -Epoch:  3, Train Err.:1.87459, Validation Err.:1.86765, Test Accuracy:0.51140, Max Test Accuracy:0.51140
N2, AdaGrad, No_Batch_Norm-Epoch:  3, Train Err.:0.09501, Validation Err.:0.12811, Test Accuracy:0.95270, Max Test Accuracy:0.95270
N2, AdaGrad, Batch_Norm   -Epoch:  3, Train Err.:0.01708, Validation Err.:0.08797, Test Accuracy:0.97150, Max Test Accuracy:0.97150
He, AdaGrad, No_Batch_Norm-Epoch:  3, Train Err.:0.12191, Validation Err.:0.13801, Test Accuracy:0.94970, Max Test Accuracy:0.94970
He, AdaGrad, Batch_Norm   -Epoch:  3, Train Err.:0.02223, Validation Err.:0.08496, Test Accuracy:0.97220, Max Test Accuracy:0.97220

N2, SGD, No_Batch_Norm    -Epoch:  4, Train Err.:2.03846, Validation Err.:2.02219, Test Accuracy:0.34520, Max Test Accuracy:0.34520
N2, SGD, Batch_Norm       -Epoch:  4, Train Err.:1.31604, Validation Err.:1.31001, Test Accuracy:0.62020, Max Test Accuracy:0.62020
N2, AdaGrad, No_Batch_Norm-Epoch:  4, Train Err.:0.07719, Validation Err.:0.12186, Test Accuracy:0.95640, Max Test Accuracy:0.95640
N2, AdaGrad, Batch_Norm   -Epoch:  4, Train Err.:0.01082, Validation Err.:0.08955, Test Accuracy:0.97380, Max Test Accuracy:0.97380
He, AdaGrad, No_Batch_Norm-Epoch:  4, Train Err.:0.10159, Validation Err.:0.13027, Test Accuracy:0.95340, Max Test Accuracy:0.95340
He, AdaGrad, Batch_Norm   -Epoch:  4, Train Err.:0.01767, Validation Err.:0.08480, Test Accuracy:0.97360, Max Test Accuracy:0.97360

N2, SGD, No_Batch_Norm    -Epoch:  5, Train Err.:1.32011, Validation Err.:1.28427, Test Accuracy:0.58910, Max Test Accuracy:0.58910
N2, SGD, Batch_Norm       -Epoch:  5, Train Err.:0.64360, Validation Err.:0.64611, Test Accuracy:0.85730, Max Test Accuracy:0.85730
N2, AdaGrad, No_Batch_Norm-Epoch:  5, Train Err.:0.06636, Validation Err.:0.11686, Test Accuracy:0.95740, Max Test Accuracy:0.95740
N2, AdaGrad, Batch_Norm   -Epoch:  5, Train Err.:0.00727, Validation Err.:0.09021, Test Accuracy:0.97430, Max Test Accuracy:0.97430
He, AdaGrad, No_Batch_Norm-Epoch:  5, Train Err.:0.08680, Validation Err.:0.12379, Test Accuracy:0.95610, Max Test Accuracy:0.95610
He, AdaGrad, Batch_Norm   -Epoch:  5, Train Err.:0.01242, Validation Err.:0.08559, Test Accuracy:0.97340, Max Test Accuracy:0.97360

N2, SGD, No_Batch_Norm    -Epoch:  6, Train Err.:0.79624, Validation Err.:0.73915, Test Accuracy:0.74060, Max Test Accuracy:0.74060
N2, SGD, Batch_Norm       -Epoch:  6, Train Err.:0.30023, Validation Err.:0.31701, Test Accuracy:0.93570, Max Test Accuracy:0.93570
N2, AdaGrad, No_Batch_Norm-Epoch:  6, Train Err.:0.05990, Validation Err.:0.11310, Test Accuracy:0.95900, Max Test Accuracy:0.95900
N2, AdaGrad, Batch_Norm   -Epoch:  6, Train Err.:0.00510, Validation Err.:0.09130, Test Accuracy:0.97610, Max Test Accuracy:0.97610
He, AdaGrad, No_Batch_Norm-Epoch:  6, Train Err.:0.07553, Validation Err.:0.12046, Test Accuracy:0.95790, Max Test Accuracy:0.95790
He, AdaGrad, Batch_Norm   -Epoch:  6, Train Err.:0.00794, Validation Err.:0.08634, Test Accuracy:0.97520, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch:  7, Train Err.:0.65980, Validation Err.:0.59393, Test Accuracy:0.79580, Max Test Accuracy:0.79580
N2, SGD, Batch_Norm       -Epoch:  7, Train Err.:0.16340, Validation Err.:0.19568, Test Accuracy:0.95150, Max Test Accuracy:0.95150
N2, AdaGrad, No_Batch_Norm-Epoch:  7, Train Err.:0.05385, Validation Err.:0.11145, Test Accuracy:0.96110, Max Test Accuracy:0.96110
N2, AdaGrad, Batch_Norm   -Epoch:  7, Train Err.:0.00390, Validation Err.:0.09530, Test Accuracy:0.97630, Max Test Accuracy:0.97630
He, AdaGrad, No_Batch_Norm-Epoch:  7, Train Err.:0.06787, Validation Err.:0.11779, Test Accuracy:0.96050, Max Test Accuracy:0.96050
He, AdaGrad, Batch_Norm   -Epoch:  7, Train Err.:0.00552, Validation Err.:0.08728, Test Accuracy:0.97390, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch:  8, Train Err.:0.58878, Validation Err.:0.52060, Test Accuracy:0.82420, Max Test Accuracy:0.82420
N2, SGD, Batch_Norm       -Epoch:  8, Train Err.:0.10522, Validation Err.:0.14610, Test Accuracy:0.96080, Max Test Accuracy:0.96080
N2, AdaGrad, No_Batch_Norm-Epoch:  8, Train Err.:0.04869, Validation Err.:0.11205, Test Accuracy:0.96180, Max Test Accuracy:0.96180
N2, AdaGrad, Batch_Norm   -Epoch:  8, Train Err.:0.00304, Validation Err.:0.09796, Test Accuracy:0.97650, Max Test Accuracy:0.97650
He, AdaGrad, No_Batch_Norm-Epoch:  8, Train Err.:0.06219, Validation Err.:0.11598, Test Accuracy:0.96170, Max Test Accuracy:0.96170
He, AdaGrad, Batch_Norm   -Epoch:  8, Train Err.:0.00421, Validation Err.:0.09080, Test Accuracy:0.97420, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch:  9, Train Err.:0.52825, Validation Err.:0.45882, Test Accuracy:0.84660, Max Test Accuracy:0.84660
N2, SGD, Batch_Norm       -Epoch:  9, Train Err.:0.07354, Validation Err.:0.11918, Test Accuracy:0.96720, Max Test Accuracy:0.96720
N2, AdaGrad, No_Batch_Norm-Epoch:  9, Train Err.:0.04488, Validation Err.:0.11235, Test Accuracy:0.96220, Max Test Accuracy:0.96220
N2, AdaGrad, Batch_Norm   -Epoch:  9, Train Err.:0.00243, Validation Err.:0.10307, Test Accuracy:0.97700, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch:  9, Train Err.:0.05715, Validation Err.:0.11423, Test Accuracy:0.96310, Max Test Accuracy:0.96310
He, AdaGrad, Batch_Norm   -Epoch:  9, Train Err.:0.00329, Validation Err.:0.09269, Test Accuracy:0.97360, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 10, Train Err.:0.48160, Validation Err.:0.40801, Test Accuracy:0.86270, Max Test Accuracy:0.86270
N2, SGD, Batch_Norm       -Epoch: 10, Train Err.:0.05488, Validation Err.:0.10457, Test Accuracy:0.97090, Max Test Accuracy:0.97090
N2, AdaGrad, No_Batch_Norm-Epoch: 10, Train Err.:0.04085, Validation Err.:0.11234, Test Accuracy:0.96230, Max Test Accuracy:0.96230
N2, AdaGrad, Batch_Norm   -Epoch: 10, Train Err.:0.00194, Validation Err.:0.10662, Test Accuracy:0.97630, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 10, Train Err.:0.05259, Validation Err.:0.11215, Test Accuracy:0.96380, Max Test Accuracy:0.96380
He, AdaGrad, Batch_Norm   -Epoch: 10, Train Err.:0.00259, Validation Err.:0.09566, Test Accuracy:0.97320, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 11, Train Err.:0.45130, Validation Err.:0.37351, Test Accuracy:0.87320, Max Test Accuracy:0.87320
N2, SGD, Batch_Norm       -Epoch: 11, Train Err.:0.04303, Validation Err.:0.09581, Test Accuracy:0.97340, Max Test Accuracy:0.97340
N2, AdaGrad, No_Batch_Norm-Epoch: 11, Train Err.:0.03811, Validation Err.:0.11280, Test Accuracy:0.96310, Max Test Accuracy:0.96310
N2, AdaGrad, Batch_Norm   -Epoch: 11, Train Err.:0.00160, Validation Err.:0.11009, Test Accuracy:0.97640, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 11, Train Err.:0.04696, Validation Err.:0.11148, Test Accuracy:0.96440, Max Test Accuracy:0.96440
He, AdaGrad, Batch_Norm   -Epoch: 11, Train Err.:0.00219, Validation Err.:0.09775, Test Accuracy:0.97370, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 12, Train Err.:0.42699, Validation Err.:0.34824, Test Accuracy:0.88190, Max Test Accuracy:0.88190
N2, SGD, Batch_Norm       -Epoch: 12, Train Err.:0.03519, Validation Err.:0.09106, Test Accuracy:0.97490, Max Test Accuracy:0.97490
N2, AdaGrad, No_Batch_Norm-Epoch: 12, Train Err.:0.03566, Validation Err.:0.11386, Test Accuracy:0.96370, Max Test Accuracy:0.96370
N2, AdaGrad, Batch_Norm   -Epoch: 12, Train Err.:0.00138, Validation Err.:0.11260, Test Accuracy:0.97650, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 12, Train Err.:0.04388, Validation Err.:0.11102, Test Accuracy:0.96450, Max Test Accuracy:0.96450
He, AdaGrad, Batch_Norm   -Epoch: 12, Train Err.:0.00181, Validation Err.:0.09970, Test Accuracy:0.97310, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 13, Train Err.:0.40504, Validation Err.:0.32743, Test Accuracy:0.88800, Max Test Accuracy:0.88800
N2, SGD, Batch_Norm       -Epoch: 13, Train Err.:0.02928, Validation Err.:0.08753, Test Accuracy:0.97480, Max Test Accuracy:0.97490
N2, AdaGrad, No_Batch_Norm-Epoch: 13, Train Err.:0.03294, Validation Err.:0.11450, Test Accuracy:0.96330, Max Test Accuracy:0.96370
N2, AdaGrad, Batch_Norm   -Epoch: 13, Train Err.:0.00116, Validation Err.:0.11458, Test Accuracy:0.97650, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 13, Train Err.:0.04062, Validation Err.:0.11041, Test Accuracy:0.96520, Max Test Accuracy:0.96520
He, AdaGrad, Batch_Norm   -Epoch: 13, Train Err.:0.00156, Validation Err.:0.10114, Test Accuracy:0.97330, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 14, Train Err.:0.38497, Validation Err.:0.30956, Test Accuracy:0.89390, Max Test Accuracy:0.89390
N2, SGD, Batch_Norm       -Epoch: 14, Train Err.:0.02486, Validation Err.:0.08599, Test Accuracy:0.97410, Max Test Accuracy:0.97490
N2, AdaGrad, No_Batch_Norm-Epoch: 14, Train Err.:0.03046, Validation Err.:0.11549, Test Accuracy:0.96350, Max Test Accuracy:0.96370
N2, AdaGrad, Batch_Norm   -Epoch: 14, Train Err.:0.00101, Validation Err.:0.11607, Test Accuracy:0.97650, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 14, Train Err.:0.03738, Validation Err.:0.11023, Test Accuracy:0.96510, Max Test Accuracy:0.96520
He, AdaGrad, Batch_Norm   -Epoch: 14, Train Err.:0.00133, Validation Err.:0.10274, Test Accuracy:0.97350, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 15, Train Err.:0.36626, Validation Err.:0.29415, Test Accuracy:0.89830, Max Test Accuracy:0.89830
N2, SGD, Batch_Norm       -Epoch: 15, Train Err.:0.02116, Validation Err.:0.08525, Test Accuracy:0.97500, Max Test Accuracy:0.97500
N2, AdaGrad, No_Batch_Norm-Epoch: 15, Train Err.:0.02821, Validation Err.:0.11692, Test Accuracy:0.96360, Max Test Accuracy:0.96370
N2, AdaGrad, Batch_Norm   -Epoch: 15, Train Err.:0.00088, Validation Err.:0.11764, Test Accuracy:0.97650, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 15, Train Err.:0.03517, Validation Err.:0.11059, Test Accuracy:0.96570, Max Test Accuracy:0.96570
He, AdaGrad, Batch_Norm   -Epoch: 15, Train Err.:0.00119, Validation Err.:0.10467, Test Accuracy:0.97360, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 16, Train Err.:0.34959, Validation Err.:0.28041, Test Accuracy:0.90390, Max Test Accuracy:0.90390
N2, SGD, Batch_Norm       -Epoch: 16, Train Err.:0.01766, Validation Err.:0.08557, Test Accuracy:0.97530, Max Test Accuracy:0.97530
N2, AdaGrad, No_Batch_Norm-Epoch: 16, Train Err.:0.02642, Validation Err.:0.11830, Test Accuracy:0.96330, Max Test Accuracy:0.96370
N2, AdaGrad, Batch_Norm   -Epoch: 16, Train Err.:0.00079, Validation Err.:0.11851, Test Accuracy:0.97680, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 16, Train Err.:0.03254, Validation Err.:0.11065, Test Accuracy:0.96600, Max Test Accuracy:0.96600
He, AdaGrad, Batch_Norm   -Epoch: 16, Train Err.:0.00106, Validation Err.:0.10594, Test Accuracy:0.97400, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 17, Train Err.:0.33412, Validation Err.:0.26818, Test Accuracy:0.90800, Max Test Accuracy:0.90800
N2, SGD, Batch_Norm       -Epoch: 17, Train Err.:0.01482, Validation Err.:0.08499, Test Accuracy:0.97570, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 17, Train Err.:0.02445, Validation Err.:0.12019, Test Accuracy:0.96290, Max Test Accuracy:0.96370
N2, AdaGrad, Batch_Norm   -Epoch: 17, Train Err.:0.00071, Validation Err.:0.11948, Test Accuracy:0.97670, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 17, Train Err.:0.03080, Validation Err.:0.11160, Test Accuracy:0.96690, Max Test Accuracy:0.96690
He, AdaGrad, Batch_Norm   -Epoch: 17, Train Err.:0.00095, Validation Err.:0.10721, Test Accuracy:0.97420, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 18, Train Err.:0.31952, Validation Err.:0.25706, Test Accuracy:0.91130, Max Test Accuracy:0.91130
N2, SGD, Batch_Norm       -Epoch: 18, Train Err.:0.01276, Validation Err.:0.08456, Test Accuracy:0.97540, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 18, Train Err.:0.02259, Validation Err.:0.12186, Test Accuracy:0.96410, Max Test Accuracy:0.96410
N2, AdaGrad, Batch_Norm   -Epoch: 18, Train Err.:0.00064, Validation Err.:0.12058, Test Accuracy:0.97690, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 18, Train Err.:0.02863, Validation Err.:0.11167, Test Accuracy:0.96660, Max Test Accuracy:0.96690
He, AdaGrad, Batch_Norm   -Epoch: 18, Train Err.:0.00086, Validation Err.:0.10881, Test Accuracy:0.97430, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 19, Train Err.:0.30613, Validation Err.:0.24693, Test Accuracy:0.91380, Max Test Accuracy:0.91380
N2, SGD, Batch_Norm       -Epoch: 19, Train Err.:0.01115, Validation Err.:0.08410, Test Accuracy:0.97510, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 19, Train Err.:0.02127, Validation Err.:0.12316, Test Accuracy:0.96420, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 19, Train Err.:0.00059, Validation Err.:0.12152, Test Accuracy:0.97690, Max Test Accuracy:0.97700
He, AdaGrad, No_Batch_Norm-Epoch: 19, Train Err.:0.02658, Validation Err.:0.11335, Test Accuracy:0.96670, Max Test Accuracy:0.96690
He, AdaGrad, Batch_Norm   -Epoch: 19, Train Err.:0.00078, Validation Err.:0.11002, Test Accuracy:0.97430, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 20, Train Err.:0.29372, Validation Err.:0.23764, Test Accuracy:0.91650, Max Test Accuracy:0.91650
N2, SGD, Batch_Norm       -Epoch: 20, Train Err.:0.00985, Validation Err.:0.08445, Test Accuracy:0.97440, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 20, Train Err.:0.01970, Validation Err.:0.12503, Test Accuracy:0.96420, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 20, Train Err.:0.00054, Validation Err.:0.12274, Test Accuracy:0.97710, Max Test Accuracy:0.97710
He, AdaGrad, No_Batch_Norm-Epoch: 20, Train Err.:0.02503, Validation Err.:0.11407, Test Accuracy:0.96750, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 20, Train Err.:0.00072, Validation Err.:0.11122, Test Accuracy:0.97440, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 21, Train Err.:0.28205, Validation Err.:0.22896, Test Accuracy:0.91890, Max Test Accuracy:0.91890
N2, SGD, Batch_Norm       -Epoch: 21, Train Err.:0.00871, Validation Err.:0.08360, Test Accuracy:0.97460, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 21, Train Err.:0.01834, Validation Err.:0.12802, Test Accuracy:0.96420, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 21, Train Err.:0.00050, Validation Err.:0.12379, Test Accuracy:0.97710, Max Test Accuracy:0.97710
He, AdaGrad, No_Batch_Norm-Epoch: 21, Train Err.:0.02354, Validation Err.:0.11514, Test Accuracy:0.96730, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 21, Train Err.:0.00067, Validation Err.:0.11254, Test Accuracy:0.97450, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 22, Train Err.:0.27117, Validation Err.:0.22087, Test Accuracy:0.92180, Max Test Accuracy:0.92180
N2, SGD, Batch_Norm       -Epoch: 22, Train Err.:0.00777, Validation Err.:0.08401, Test Accuracy:0.97420, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 22, Train Err.:0.01741, Validation Err.:0.12887, Test Accuracy:0.96420, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 22, Train Err.:0.00047, Validation Err.:0.12481, Test Accuracy:0.97710, Max Test Accuracy:0.97710
He, AdaGrad, No_Batch_Norm-Epoch: 22, Train Err.:0.02249, Validation Err.:0.11574, Test Accuracy:0.96690, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 22, Train Err.:0.00062, Validation Err.:0.11362, Test Accuracy:0.97440, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 23, Train Err.:0.26075, Validation Err.:0.21323, Test Accuracy:0.92460, Max Test Accuracy:0.92460
N2, SGD, Batch_Norm       -Epoch: 23, Train Err.:0.00715, Validation Err.:0.08335, Test Accuracy:0.97360, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 23, Train Err.:0.01611, Validation Err.:0.13067, Test Accuracy:0.96370, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 23, Train Err.:0.00044, Validation Err.:0.12586, Test Accuracy:0.97720, Max Test Accuracy:0.97720
He, AdaGrad, No_Batch_Norm-Epoch: 23, Train Err.:0.02146, Validation Err.:0.11648, Test Accuracy:0.96660, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 23, Train Err.:0.00058, Validation Err.:0.11470, Test Accuracy:0.97430, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 24, Train Err.:0.25069, Validation Err.:0.20605, Test Accuracy:0.92760, Max Test Accuracy:0.92760
N2, SGD, Batch_Norm       -Epoch: 24, Train Err.:0.00650, Validation Err.:0.08365, Test Accuracy:0.97350, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 24, Train Err.:0.01508, Validation Err.:0.13254, Test Accuracy:0.96400, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 24, Train Err.:0.00041, Validation Err.:0.12683, Test Accuracy:0.97750, Max Test Accuracy:0.97750
He, AdaGrad, No_Batch_Norm-Epoch: 24, Train Err.:0.02042, Validation Err.:0.11790, Test Accuracy:0.96750, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 24, Train Err.:0.00054, Validation Err.:0.11563, Test Accuracy:0.97440, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 25, Train Err.:0.24084, Validation Err.:0.19927, Test Accuracy:0.93030, Max Test Accuracy:0.93030
N2, SGD, Batch_Norm       -Epoch: 25, Train Err.:0.00607, Validation Err.:0.08327, Test Accuracy:0.97410, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 25, Train Err.:0.01414, Validation Err.:0.13492, Test Accuracy:0.96410, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 25, Train Err.:0.00038, Validation Err.:0.12764, Test Accuracy:0.97770, Max Test Accuracy:0.97770
He, AdaGrad, No_Batch_Norm-Epoch: 25, Train Err.:0.01964, Validation Err.:0.11808, Test Accuracy:0.96710, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 25, Train Err.:0.00050, Validation Err.:0.11657, Test Accuracy:0.97440, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 26, Train Err.:0.23130, Validation Err.:0.19266, Test Accuracy:0.93240, Max Test Accuracy:0.93240
N2, SGD, Batch_Norm       -Epoch: 26, Train Err.:0.00560, Validation Err.:0.08301, Test Accuracy:0.97380, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 26, Train Err.:0.01322, Validation Err.:0.13632, Test Accuracy:0.96400, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 26, Train Err.:0.00036, Validation Err.:0.12859, Test Accuracy:0.97780, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 26, Train Err.:0.01873, Validation Err.:0.11930, Test Accuracy:0.96710, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 26, Train Err.:0.00047, Validation Err.:0.11756, Test Accuracy:0.97430, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 27, Train Err.:0.22194, Validation Err.:0.18645, Test Accuracy:0.93430, Max Test Accuracy:0.93430
N2, SGD, Batch_Norm       -Epoch: 27, Train Err.:0.00517, Validation Err.:0.08312, Test Accuracy:0.97460, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 27, Train Err.:0.01216, Validation Err.:0.13738, Test Accuracy:0.96360, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 27, Train Err.:0.00034, Validation Err.:0.12942, Test Accuracy:0.97770, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 27, Train Err.:0.01786, Validation Err.:0.11987, Test Accuracy:0.96740, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 27, Train Err.:0.00045, Validation Err.:0.11840, Test Accuracy:0.97430, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 28, Train Err.:0.21302, Validation Err.:0.18058, Test Accuracy:0.93670, Max Test Accuracy:0.93670
N2, SGD, Batch_Norm       -Epoch: 28, Train Err.:0.00480, Validation Err.:0.08302, Test Accuracy:0.97460, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 28, Train Err.:0.01178, Validation Err.:0.13984, Test Accuracy:0.96370, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 28, Train Err.:0.00033, Validation Err.:0.13013, Test Accuracy:0.97770, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 28, Train Err.:0.01713, Validation Err.:0.12092, Test Accuracy:0.96730, Max Test Accuracy:0.96750
He, AdaGrad, Batch_Norm   -Epoch: 28, Train Err.:0.00042, Validation Err.:0.11916, Test Accuracy:0.97430, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 29, Train Err.:0.20453, Validation Err.:0.17506, Test Accuracy:0.93910, Max Test Accuracy:0.93910
N2, SGD, Batch_Norm       -Epoch: 29, Train Err.:0.00449, Validation Err.:0.08289, Test Accuracy:0.97490, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 29, Train Err.:0.01116, Validation Err.:0.14196, Test Accuracy:0.96370, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 29, Train Err.:0.00031, Validation Err.:0.13093, Test Accuracy:0.97770, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 29, Train Err.:0.01650, Validation Err.:0.12167, Test Accuracy:0.96760, Max Test Accuracy:0.96760
He, AdaGrad, Batch_Norm   -Epoch: 29, Train Err.:0.00040, Validation Err.:0.12010, Test Accuracy:0.97410, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 30, Train Err.:0.19651, Validation Err.:0.16975, Test Accuracy:0.94060, Max Test Accuracy:0.94060
N2, SGD, Batch_Norm       -Epoch: 30, Train Err.:0.00422, Validation Err.:0.08285, Test Accuracy:0.97410, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 30, Train Err.:0.01041, Validation Err.:0.14353, Test Accuracy:0.96360, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 30, Train Err.:0.00029, Validation Err.:0.13171, Test Accuracy:0.97760, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 30, Train Err.:0.01577, Validation Err.:0.12294, Test Accuracy:0.96740, Max Test Accuracy:0.96760
He, AdaGrad, Batch_Norm   -Epoch: 30, Train Err.:0.00038, Validation Err.:0.12076, Test Accuracy:0.97410, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 31, Train Err.:0.18891, Validation Err.:0.16480, Test Accuracy:0.94220, Max Test Accuracy:0.94220
N2, SGD, Batch_Norm       -Epoch: 31, Train Err.:0.00395, Validation Err.:0.08283, Test Accuracy:0.97450, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 31, Train Err.:0.00996, Validation Err.:0.14605, Test Accuracy:0.96310, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 31, Train Err.:0.00028, Validation Err.:0.13242, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 31, Train Err.:0.01522, Validation Err.:0.12392, Test Accuracy:0.96800, Max Test Accuracy:0.96800
He, AdaGrad, Batch_Norm   -Epoch: 31, Train Err.:0.00036, Validation Err.:0.12149, Test Accuracy:0.97420, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 32, Train Err.:0.18163, Validation Err.:0.16016, Test Accuracy:0.94330, Max Test Accuracy:0.94330
N2, SGD, Batch_Norm       -Epoch: 32, Train Err.:0.00372, Validation Err.:0.08270, Test Accuracy:0.97410, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 32, Train Err.:0.00953, Validation Err.:0.14784, Test Accuracy:0.96320, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 32, Train Err.:0.00027, Validation Err.:0.13304, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 32, Train Err.:0.01442, Validation Err.:0.12540, Test Accuracy:0.96790, Max Test Accuracy:0.96800
He, AdaGrad, Batch_Norm   -Epoch: 32, Train Err.:0.00034, Validation Err.:0.12223, Test Accuracy:0.97420, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 33, Train Err.:0.17469, Validation Err.:0.15575, Test Accuracy:0.94530, Max Test Accuracy:0.94530
N2, SGD, Batch_Norm       -Epoch: 33, Train Err.:0.00353, Validation Err.:0.08281, Test Accuracy:0.97470, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 33, Train Err.:0.00903, Validation Err.:0.14978, Test Accuracy:0.96350, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 33, Train Err.:0.00026, Validation Err.:0.13374, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 33, Train Err.:0.01374, Validation Err.:0.12632, Test Accuracy:0.96790, Max Test Accuracy:0.96800
He, AdaGrad, Batch_Norm   -Epoch: 33, Train Err.:0.00033, Validation Err.:0.12285, Test Accuracy:0.97410, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 34, Train Err.:0.16821, Validation Err.:0.15166, Test Accuracy:0.94630, Max Test Accuracy:0.94630
N2, SGD, Batch_Norm       -Epoch: 34, Train Err.:0.00333, Validation Err.:0.08228, Test Accuracy:0.97450, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 34, Train Err.:0.00887, Validation Err.:0.15215, Test Accuracy:0.96290, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 34, Train Err.:0.00025, Validation Err.:0.13437, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 34, Train Err.:0.01326, Validation Err.:0.12789, Test Accuracy:0.96780, Max Test Accuracy:0.96800
He, AdaGrad, Batch_Norm   -Epoch: 34, Train Err.:0.00031, Validation Err.:0.12351, Test Accuracy:0.97390, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 35, Train Err.:0.16203, Validation Err.:0.14782, Test Accuracy:0.94840, Max Test Accuracy:0.94840
N2, SGD, Batch_Norm       -Epoch: 35, Train Err.:0.00318, Validation Err.:0.08267, Test Accuracy:0.97410, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 35, Train Err.:0.00842, Validation Err.:0.15306, Test Accuracy:0.96310, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 35, Train Err.:0.00024, Validation Err.:0.13498, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 35, Train Err.:0.01279, Validation Err.:0.12788, Test Accuracy:0.96810, Max Test Accuracy:0.96810
He, AdaGrad, Batch_Norm   -Epoch: 35, Train Err.:0.00030, Validation Err.:0.12411, Test Accuracy:0.97400, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 36, Train Err.:0.15635, Validation Err.:0.14423, Test Accuracy:0.94980, Max Test Accuracy:0.94980
N2, SGD, Batch_Norm       -Epoch: 36, Train Err.:0.00302, Validation Err.:0.08239, Test Accuracy:0.97430, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 36, Train Err.:0.00799, Validation Err.:0.15626, Test Accuracy:0.96290, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 36, Train Err.:0.00023, Validation Err.:0.13557, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 36, Train Err.:0.01226, Validation Err.:0.12988, Test Accuracy:0.96810, Max Test Accuracy:0.96810
He, AdaGrad, Batch_Norm   -Epoch: 36, Train Err.:0.00029, Validation Err.:0.12464, Test Accuracy:0.97400, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 37, Train Err.:0.15101, Validation Err.:0.14092, Test Accuracy:0.95040, Max Test Accuracy:0.95040
N2, SGD, Batch_Norm       -Epoch: 37, Train Err.:0.00289, Validation Err.:0.08262, Test Accuracy:0.97420, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 37, Train Err.:0.00743, Validation Err.:0.15788, Test Accuracy:0.96260, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 37, Train Err.:0.00022, Validation Err.:0.13611, Test Accuracy:0.97750, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 37, Train Err.:0.01169, Validation Err.:0.13048, Test Accuracy:0.96840, Max Test Accuracy:0.96840
He, AdaGrad, Batch_Norm   -Epoch: 37, Train Err.:0.00028, Validation Err.:0.12523, Test Accuracy:0.97390, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 38, Train Err.:0.14598, Validation Err.:0.13778, Test Accuracy:0.95150, Max Test Accuracy:0.95150
N2, SGD, Batch_Norm       -Epoch: 38, Train Err.:0.00277, Validation Err.:0.08238, Test Accuracy:0.97370, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 38, Train Err.:0.00720, Validation Err.:0.15925, Test Accuracy:0.96270, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 38, Train Err.:0.00021, Validation Err.:0.13673, Test Accuracy:0.97750, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 38, Train Err.:0.01135, Validation Err.:0.13158, Test Accuracy:0.96840, Max Test Accuracy:0.96840
He, AdaGrad, Batch_Norm   -Epoch: 38, Train Err.:0.00027, Validation Err.:0.12585, Test Accuracy:0.97370, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 39, Train Err.:0.14118, Validation Err.:0.13490, Test Accuracy:0.95270, Max Test Accuracy:0.95270
N2, SGD, Batch_Norm       -Epoch: 39, Train Err.:0.00265, Validation Err.:0.08244, Test Accuracy:0.97370, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 39, Train Err.:0.00693, Validation Err.:0.16186, Test Accuracy:0.96280, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 39, Train Err.:0.00020, Validation Err.:0.13723, Test Accuracy:0.97750, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 39, Train Err.:0.01104, Validation Err.:0.13206, Test Accuracy:0.96860, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 39, Train Err.:0.00026, Validation Err.:0.12636, Test Accuracy:0.97370, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 40, Train Err.:0.13669, Validation Err.:0.13224, Test Accuracy:0.95330, Max Test Accuracy:0.95330
N2, SGD, Batch_Norm       -Epoch: 40, Train Err.:0.00255, Validation Err.:0.08238, Test Accuracy:0.97390, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 40, Train Err.:0.00658, Validation Err.:0.16311, Test Accuracy:0.96210, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 40, Train Err.:0.00020, Validation Err.:0.13771, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 40, Train Err.:0.01053, Validation Err.:0.13296, Test Accuracy:0.96850, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 40, Train Err.:0.00025, Validation Err.:0.12694, Test Accuracy:0.97380, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 41, Train Err.:0.13246, Validation Err.:0.12978, Test Accuracy:0.95420, Max Test Accuracy:0.95420
N2, SGD, Batch_Norm       -Epoch: 41, Train Err.:0.00244, Validation Err.:0.08242, Test Accuracy:0.97380, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 41, Train Err.:0.00622, Validation Err.:0.16490, Test Accuracy:0.96220, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 41, Train Err.:0.00019, Validation Err.:0.13821, Test Accuracy:0.97730, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 41, Train Err.:0.01007, Validation Err.:0.13419, Test Accuracy:0.96850, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 41, Train Err.:0.00024, Validation Err.:0.12742, Test Accuracy:0.97380, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 42, Train Err.:0.12850, Validation Err.:0.12750, Test Accuracy:0.95460, Max Test Accuracy:0.95460
N2, SGD, Batch_Norm       -Epoch: 42, Train Err.:0.00235, Validation Err.:0.08221, Test Accuracy:0.97360, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 42, Train Err.:0.00600, Validation Err.:0.16643, Test Accuracy:0.96220, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 42, Train Err.:0.00018, Validation Err.:0.13869, Test Accuracy:0.97720, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 42, Train Err.:0.00978, Validation Err.:0.13493, Test Accuracy:0.96780, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 42, Train Err.:0.00023, Validation Err.:0.12790, Test Accuracy:0.97370, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 43, Train Err.:0.12470, Validation Err.:0.12529, Test Accuracy:0.95600, Max Test Accuracy:0.95600
N2, SGD, Batch_Norm       -Epoch: 43, Train Err.:0.00226, Validation Err.:0.08232, Test Accuracy:0.97420, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 43, Train Err.:0.00578, Validation Err.:0.16789, Test Accuracy:0.96210, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 43, Train Err.:0.00018, Validation Err.:0.13918, Test Accuracy:0.97720, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 43, Train Err.:0.00951, Validation Err.:0.13580, Test Accuracy:0.96780, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 43, Train Err.:0.00022, Validation Err.:0.12843, Test Accuracy:0.97380, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 44, Train Err.:0.12109, Validation Err.:0.12324, Test Accuracy:0.95690, Max Test Accuracy:0.95690
N2, SGD, Batch_Norm       -Epoch: 44, Train Err.:0.00218, Validation Err.:0.08235, Test Accuracy:0.97380, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 44, Train Err.:0.00543, Validation Err.:0.16925, Test Accuracy:0.96200, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 44, Train Err.:0.00017, Validation Err.:0.13959, Test Accuracy:0.97720, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 44, Train Err.:0.00915, Validation Err.:0.13653, Test Accuracy:0.96810, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 44, Train Err.:0.00022, Validation Err.:0.12888, Test Accuracy:0.97370, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 45, Train Err.:0.11773, Validation Err.:0.12128, Test Accuracy:0.95760, Max Test Accuracy:0.95760
N2, SGD, Batch_Norm       -Epoch: 45, Train Err.:0.00210, Validation Err.:0.08241, Test Accuracy:0.97420, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 45, Train Err.:0.00521, Validation Err.:0.17106, Test Accuracy:0.96210, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 45, Train Err.:0.00017, Validation Err.:0.14005, Test Accuracy:0.97720, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 45, Train Err.:0.00883, Validation Err.:0.13793, Test Accuracy:0.96800, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 45, Train Err.:0.00021, Validation Err.:0.12942, Test Accuracy:0.97370, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 46, Train Err.:0.11454, Validation Err.:0.11936, Test Accuracy:0.95780, Max Test Accuracy:0.95780
N2, SGD, Batch_Norm       -Epoch: 46, Train Err.:0.00203, Validation Err.:0.08213, Test Accuracy:0.97390, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 46, Train Err.:0.00501, Validation Err.:0.17291, Test Accuracy:0.96200, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 46, Train Err.:0.00016, Validation Err.:0.14048, Test Accuracy:0.97720, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 46, Train Err.:0.00837, Validation Err.:0.13798, Test Accuracy:0.96770, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 46, Train Err.:0.00020, Validation Err.:0.12979, Test Accuracy:0.97390, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 47, Train Err.:0.11148, Validation Err.:0.11759, Test Accuracy:0.95860, Max Test Accuracy:0.95860
N2, SGD, Batch_Norm       -Epoch: 47, Train Err.:0.00198, Validation Err.:0.08242, Test Accuracy:0.97420, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 47, Train Err.:0.00488, Validation Err.:0.17379, Test Accuracy:0.96250, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 47, Train Err.:0.00016, Validation Err.:0.14091, Test Accuracy:0.97730, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 47, Train Err.:0.00815, Validation Err.:0.13917, Test Accuracy:0.96810, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 47, Train Err.:0.00020, Validation Err.:0.13024, Test Accuracy:0.97380, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 48, Train Err.:0.10846, Validation Err.:0.11591, Test Accuracy:0.95970, Max Test Accuracy:0.95970
N2, SGD, Batch_Norm       -Epoch: 48, Train Err.:0.00190, Validation Err.:0.08237, Test Accuracy:0.97400, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 48, Train Err.:0.00465, Validation Err.:0.17527, Test Accuracy:0.96200, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 48, Train Err.:0.00015, Validation Err.:0.14129, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 48, Train Err.:0.00782, Validation Err.:0.14013, Test Accuracy:0.96790, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 48, Train Err.:0.00019, Validation Err.:0.13071, Test Accuracy:0.97390, Max Test Accuracy:0.97520

N2, SGD, No_Batch_Norm    -Epoch: 49, Train Err.:0.10557, Validation Err.:0.11436, Test Accuracy:0.96020, Max Test Accuracy:0.96020
N2, SGD, Batch_Norm       -Epoch: 49, Train Err.:0.00185, Validation Err.:0.08243, Test Accuracy:0.97410, Max Test Accuracy:0.97570
N2, AdaGrad, No_Batch_Norm-Epoch: 49, Train Err.:0.00454, Validation Err.:0.17855, Test Accuracy:0.96170, Max Test Accuracy:0.96420
N2, AdaGrad, Batch_Norm   -Epoch: 49, Train Err.:0.00015, Validation Err.:0.14174, Test Accuracy:0.97740, Max Test Accuracy:0.97780
He, AdaGrad, No_Batch_Norm-Epoch: 49, Train Err.:0.00757, Validation Err.:0.14114, Test Accuracy:0.96780, Max Test Accuracy:0.96860
He, AdaGrad, Batch_Norm   -Epoch: 49, Train Err.:0.00019, Validation Err.:0.13112, Test Accuracy:0.97380, Max Test Accuracy:0.97520

In [11]:
f, axarr = plt.subplots(2, 2, figsize=(20, 12))
for key in markers.keys():
    axarr[0, 0].plot(epoch_list[1:], train_errors[key][1:], marker=markers[key], markevery=2, label=key)
axarr[0, 0].set_ylabel('Train - Total Error')
axarr[0, 0].set_xlabel('Epochs')
axarr[0, 0].grid(True)
axarr[0, 0].set_title('Train Error')
axarr[0, 0].legend(loc='upper right')

for key in markers.keys():
    axarr[0, 1].plot(epoch_list[1:], validation_errors[key][1:], marker=markers[key], markevery=2, label=key)
axarr[0, 1].set_ylabel('Validation - Total Error')
axarr[0, 1].set_xlabel('Epochs')
axarr[0, 1].grid(True)
axarr[0, 1].set_title('Validation Error')
axarr[0, 1].legend(loc='upper right')

for key in markers.keys():
    axarr[1, 0].plot(epoch_list[1:], train_errors[key][1:], marker=markers[key], markevery=2, label=key)
axarr[1, 0].set_ylabel('Train - Total Error')
axarr[1, 0].set_xlabel('Epochs')
axarr[1, 0].grid(True)
axarr[1, 0].set_ylim(0, 0.2)
axarr[1, 0].set_title('Train Error (0.00 ~ 0.20)')
axarr[1, 0].legend(loc='upper right')

for key in markers.keys():
    axarr[1, 1].plot(epoch_list[1:], validation_errors[key][1:], marker=markers[key], markevery=2, label=key)
axarr[1, 1].set_ylabel('Validation - Total Error')
axarr[1, 1].set_xlabel('Epochs')
axarr[1, 1].grid(True)
axarr[1, 1].set_ylim(0, 0.2)
axarr[1, 1].set_title('Validation Error (0.00 ~ 0.20)')
axarr[1, 1].legend(loc='upper right')

f.subplots_adjust(hspace=0.3)

plt.show()
In [17]:
f, axarr = plt.subplots(2, 1, figsize=(15,10))
for key in markers.keys():
    axarr[0].plot(epoch_list[1:], test_accuracy_values[key][1:], marker=markers[key], markevery=1, label=key)
axarr[0].set_ylabel('Test Accuracy')
axarr[0].set_xlabel('Epochs')
axarr[0].grid(True)
axarr[0].set_title('Test Accuracy')
axarr[0].legend(loc='lower right')

for key in markers.keys():
    axarr[1].plot(epoch_list[1:], test_accuracy_values[key][1:], marker=markers[key], markevery=1, label=key)
axarr[1].set_ylabel('Test Accuracy')
axarr[1].set_xlabel('Epochs')
axarr[1].grid(True)
axarr[1].set_ylim(0.94, 0.99)
axarr[1].set_title('Test Accuracy (0.94 ~ 0.99)')
axarr[1].legend(loc='lower right')

f.subplots_adjust(hspace=0.3)
plt.show()
In [16]:
for key in markers.keys():
    print("{0:26s} - Epoch:{1:3d}, Max Test Accuracy: {2:7.5f}".format(key, max_test_accuracy_epoch[key], max_test_accuracy_value[key]))
N2, SGD, No_Batch_Norm     - Epoch: 49, Max Test Accuracy: 0.96020
N2, SGD, Batch_Norm        - Epoch: 17, Max Test Accuracy: 0.97570
N2, AdaGrad, No_Batch_Norm - Epoch: 19, Max Test Accuracy: 0.96420
N2, AdaGrad, Batch_Norm    - Epoch: 26, Max Test Accuracy: 0.97780
He, AdaGrad, No_Batch_Norm - Epoch: 39, Max Test Accuracy: 0.96860
He, AdaGrad, Batch_Norm    - Epoch:  6, Max Test Accuracy: 0.97520