# Initializer Comparison with Sigmoid Activation¶

In [1]:
# coding: utf-8
import sys, os
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import OrderedDict

In [2]:
class Affine:
def __init__(self, W, b):
self.W = W
self.b = b
self.x = None
self.dW = None
self.db = None

def forward(self, x):
self.x = x
out = np.dot(self.x, self.W) + self.b
return out

def backward(self, din):
dx = np.dot(din, self.W.T)
self.dW = np.dot(self.x.T, din)
self.db = np.sum(din, axis=0)
return dx

In [3]:
class Sigmoid:
def __init__(self):
self.out = None

def forward(self, x):
out = 1 / (1 + np.exp(-x))
self.out = out
return out

def backward(self, din):
dx = din * self.out * (1.0 - self.out)
return dx

class Relu:
def __init__(self):

def forward(self, x):
out = x.copy()
return out

def backward(self, din):
dx = din
return dx


## Five layers and Activation Values¶

In [4]:
def activation_value_histogram(layers, num_epoch):
x = np.random.randn(1000, 100) # 1000 x 100
y = np.ones((1000, 100))

num_layers = len(layers)
activations = OrderedDict()
backward_activations = OrderedDict()

avg_forward_std = []
avg_backward_std = []

for epoch in range(num_epoch):
print("Epoch:", epoch)
# Forward
for i in range(num_layers):
if i != 0:
x = activations[i-1]
activations[i] = layers[i].forward(x)

# Backward
for i in reversed(range(num_layers)):
if i != num_layers - 1:
y = backward_activations[i+1]
backward_activations[i] = layers[i].backward(y)

# Parameter Update
for i in range(num_layers):
if i % 2 == 0:
layers[i].W = layers[i].W - 0.01 * layers[i].dW
layers[i].b = layers[i].b - 0.01 * layers[i].db

# Histograms of Forward Activation Values
print("Forward Activations")
forward_std_sum = 0.0
f, axarr = plt.subplots(1, num_layers, figsize=(25, 5))
for i, a in activations.items():
values = a.flatten()
std = "{:5.4f}".format(np.std(values))
forward_std_sum += float(std)
axarr[i].hist(values, 20)
if i % 2 == 0:
axarr[i].set_title(str(i+1) + "-Layer ($\sigma=$" + std + ")")
else:
axarr[i].set_title("Sigmoid Layer ($\sigma=$" + std + ")")
avg_forward_std.append(forward_std_sum / num_layers)
plt.show()

# Histograms of Backward Activation Values
print("Backward Activations")
backward_std_sum = 0.0
f, axarr = plt.subplots(1, num_layers, figsize=(25, 5))
for i, b in backward_activations.items():
values2 = b.flatten()
std = "{:5.4f}".format(np.std(values2))
backward_std_sum += float(std)
axarr[i].hist(values2, 20)
if i % 2 == 0:
axarr[i].set_title(str(i+1) + "-Layer ($\sigma=$" + std + ")")
else:
axarr[i].set_title("Sigmoid Layer ($\sigma=$" + std + ")")
avg_backward_std.append(backward_std_sum / num_layers)
plt.show()
return avg_forward_std, avg_backward_std


## 1. Weight and Bias Variation with Sigmoid Activation¶

In [5]:
markers = {"Zero": "o", "N(0.0, 1.0)": "x", "N(0.0, 0.0001)": "s", "Xavier": "o", "He": "x"}

avg_forward_std_dic = OrderedDict()
avg_backward_std_dic = OrderedDict()


### 1) Only Zero Values¶

• Weight Distribution: $W = 0$
• Bias Distribution: $B = 0$
In [14]:
node_num = 100 #각 은닉층의 노드 수
num_layers = 6 # 3개의 은닉층 + 3개의 활성화층
num_epochs = 5

layers = OrderedDict()

for i in range(num_layers):
if i % 2 == 0:
w = np.zeros((node_num, node_num))
b = np.zeros(node_num)
layers[i] = Affine(w, b)
else:
layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['Zero'] = avg_forward_std
avg_backward_std_dic['Zero'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)

Epoch: 0
Forward Activations

Backward Activations

Epoch: 1
Forward Activations

Backward Activations

Epoch: 2
Forward Activations

Backward Activations

Epoch: 3
Forward Activations

Backward Activations

Epoch: 4
Forward Activations

Backward Activations

[0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0]


### 2) Standard Normal Distribution¶

• Weight Distribution: $W \sim N(0, 1)$
• Bias Distribution: $B \sim N(0, 1)$
In [7]:
node_num = 100 #각 은닉층의 노드 수
num_layers = 6 # 3개의 은닉층 + 3개의 활성화층
num_epochs = 5

layers = OrderedDict()

for i in range(num_layers):
if i % 2 == 0:
w = np.random.randn(node_num, node_num)
b = np.random.randn(node_num)
layers[i] = Affine(w, b)
else:
layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['N(0.0, 1.0)'] = avg_forward_std
avg_backward_std_dic['N(0.0, 1.0)'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)

Epoch: 0
Forward Activations

Backward Activations

Epoch: 1
Forward Activations

Backward Activations

Epoch: 2
Forward Activations

/Users/yhhan/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:6: RuntimeWarning: overflow encountered in exp

Backward Activations

Epoch: 3
Forward Activations

Backward Activations

Epoch: 4
Forward Activations

Backward Activations

[4.139066666666667, 15.408683333333334, 48.39193333333332, 48.426566666666666, 48.89881666666667]
[0.5041666666666667, 0.00935, 5e-05, 0.0, 0.0]


### 3) Standard Normal Distribution where Standard Deviation is 0.01¶

• Weight Distribution: $W \sim N(0, 0.0001)$
• Bias Distribution: $B \sim N(0, 0.0001)$
In [8]:
node_num = 100 #각 은닉층의 노드 수
num_layers = 6 # 3개의 은닉층 + 3개의 활성화층
num_epochs = 5

layers = OrderedDict()

for i in range(num_layers):
if i % 2 == 0:
w = np.random.randn(node_num, node_num) * 0.01
b = np.random.randn(node_num) * 0.01
layers[i] = Affine(w, b)
else:
layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['N(0.0, 0.0001)'] = avg_forward_std
avg_backward_std_dic['N(0.0, 0.0001)'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)

Epoch: 0
Forward Activations

Backward Activations

Epoch: 1
Forward Activations

Backward Activations

Epoch: 2
Forward Activations

Backward Activations

Epoch: 3
Forward Activations

Backward Activations

Epoch: 4
Forward Activations

Backward Activations

[0.04065, 0.32061666666666666, 1.4111833333333335, 1.4125499999999998, 1.4125499999999998]
[0.005216666666666667, 0.0, 0.0, 0.0, 0.0]


### 4) Xavier Initializer where Standard Deviation is $1 \over \sqrt{n}$ ($n$ is the number of neurons in the previous layer)¶

• Weight Distribution: $W \sim N(0, {1 \over n})$
• Bias Distribution: $B \sim N(0, {1 \over n})$
In [9]:
node_num = 100 #각 은닉층의 노드 수
num_layers = 6 # 3개의 은닉층 + 3개의 활성화층
num_epochs = 5

layers = OrderedDict()

for i in range(num_layers):
if i % 2 == 0:
w = np.random.randn(node_num, node_num) / np.sqrt(node_num)
b = np.random.randn(node_num) / np.sqrt(node_num)
layers[i] = Affine(w, b)
else:
layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['Xavier'] = avg_forward_std
avg_backward_std_dic['Xavier'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)

Epoch: 0
Forward Activations

Backward Activations

Epoch: 1
Forward Activations

Backward Activations

Epoch: 2
Forward Activations

Backward Activations

Epoch: 3
Forward Activations

Backward Activations

Epoch: 4
Forward Activations

Backward Activations

[0.42616666666666664, 2.6696833333333334, 10.6837, 10.699766666666667, 10.700316666666666]
[0.05201666666666666, 0.0, 0.0, 0.0, 0.0]


### 5) He Initializer where Standard Deviation is $\sqrt{2 \over n}$ ($n$ is the number of neurons in the previous layer)¶

• Weight Distribution: $W \sim N(0, {2 \over n})$
• Bias Distribution: $B \sim N(0, {2 \over n})$
In [10]:
node_num = 100 #각 은닉층의 노드 수
num_layers = 6 # 3개의 은닉층 + 3개의 활성화층
num_epochs = 5

layers = OrderedDict()

for i in range(num_layers):
if i % 2 == 0:
w = np.random.randn(node_num, node_num) * np.sqrt(2) / np.sqrt(node_num)
b = np.random.randn(node_num) * np.sqrt(2)  / np.sqrt(node_num)
layers[i] = Affine(w, b)
else:
layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['He'] = avg_forward_std
avg_backward_std_dic['He'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)

Epoch: 0
Forward Activations