#!/usr/bin/env python
# coding: utf-8

# # Initializer Comparison with Sigmoid Activation

# In[1]:

import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict


# In[2]:

class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, din):
        dx = np.dot(din, self.W.T)
        self.dW = np.dot(self.x.T, din)
        self.db = np.sum(din, axis=0)
        return dx


# In[3]:

class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = 1 / (1 + np.exp(-x))
        self.out = out
        return out

    def backward(self, din):
        dx = din * self.out * (1.0 - self.out)
        return dx


class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, din):
        din[self.mask] = 0
        dx = din
        return dx


# ## Six Layers (Three Affine + Three Sigmoid) and Activation Values

# In[4]:

def activation_value_histogram(layers, num_epoch):
    input_x = np.random.randn(1000, 100)   # fixed input batch: 1000 samples x 100 features
    top_grad = np.ones((1000, 100))        # upstream gradient fed into the last layer
    num_layers = len(layers)
    activations = OrderedDict()
    backward_activations = OrderedDict()
    avg_forward_std = []
    avg_backward_std = []

    for epoch in range(num_epoch):
        print("Epoch:", epoch)

        # Forward pass: feed the same input batch every epoch
        x = input_x
        for i in range(num_layers):
            if i != 0:
                x = activations[i - 1]
            activations[i] = layers[i].forward(x)

        # Backward pass: start from the same upstream gradient every epoch
        y = top_grad
        for i in reversed(range(num_layers)):
            if i != num_layers - 1:
                y = backward_activations[i + 1]
            backward_activations[i] = layers[i].backward(y)

        # Parameter update: plain SGD on the Affine layers (even indices)
        for i in range(num_layers):
            if i % 2 == 0:
                layers[i].W = layers[i].W - 0.01 * layers[i].dW
                layers[i].b = layers[i].b - 0.01 * layers[i].db

        # Histograms of forward activation values
        print("Forward Activations")
        forward_std_sum = 0.0
        f, axarr = plt.subplots(1, num_layers, figsize=(25, 5))
        for i, a in activations.items():
            values = a.flatten()
            std = np.std(values)
            forward_std_sum += std
            axarr[i].hist(values, 20)
            if i % 2 == 0:
                axarr[i].set_title(str(i + 1) + r"-Layer ($\sigma=$" + "{:5.4f}".format(std) + ")")
            else:
                axarr[i].set_title(r"Sigmoid Layer ($\sigma=$" + "{:5.4f}".format(std) + ")")
        avg_forward_std.append(forward_std_sum / num_layers)
        f.subplots_adjust(wspace=0.5)
        plt.show()

        # Histograms of backward activation values (gradients)
        print("Backward Activations")
        backward_std_sum = 0.0
        f, axarr = plt.subplots(1, num_layers, figsize=(25, 5))
        for i, b in backward_activations.items():
            values2 = b.flatten()
            std = np.std(values2)
            backward_std_sum += std
            axarr[i].hist(values2, 20)
            if i % 2 == 0:
                axarr[i].set_title(str(i + 1) + r"-Layer ($\sigma=$" + "{:5.4f}".format(std) + ")")
            else:
                axarr[i].set_title(r"Sigmoid Layer ($\sigma=$" + "{:5.4f}".format(std) + ")")
        avg_backward_std.append(backward_std_sum / num_layers)
        f.subplots_adjust(wspace=0.5)
        plt.show()

    return avg_forward_std, avg_backward_std
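# Before running the experiments, a quick numerical gradient check (a sketch, not
# part of the original notebook) can confirm that `Affine.backward` is consistent
# with `Affine.forward`. The scalar loss, the shapes, and the `numerical_grad`
# helper below are illustrative choices, not from the source.

# In[ ]:

def numerical_grad(f, x, eps=1e-5):
    # central-difference gradient of the scalar function f() with respect to
    # the array x, perturbing one entry at a time in place
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig_val = x[idx]
        x[idx] = orig_val + eps
        f_plus = f()
        x[idx] = orig_val - eps
        f_minus = f()
        x[idx] = orig_val
        grad[idx] = (f_plus - f_minus) / (2 * eps)
        it.iternext()
    return grad

affine_check = Affine(np.random.randn(3, 2), np.random.randn(2))
x_check = np.random.randn(4, 3)
loss = lambda: np.sum(affine_check.forward(x_check))  # scalar loss = sum of outputs

affine_check.forward(x_check)
affine_check.backward(np.ones((4, 2)))  # dL/dout = 1 everywhere for the sum loss
print(np.allclose(affine_check.dW, numerical_grad(loss, affine_check.W)))  # expect True
print(np.allclose(affine_check.db, numerical_grad(loss, affine_check.b)))  # expect True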
# ## 1. Weight and Bias Variation with Sigmoid Activation

# In[5]:

markers = {"Zero": "o", "N(0.0, 1.0)": "x", "N(0.0, 0.0001)": "s", "Xavier": "o", "He": "x"}
avg_forward_std_dic = OrderedDict()
avg_backward_std_dic = OrderedDict()


# ### 1) All Zeros
# - Weight Distribution: $W = 0$
# - Bias Distribution: $B = 0$

# In[14]:

node_num = 100   # number of nodes in each hidden layer
num_layers = 6   # 3 Affine layers + 3 Sigmoid activation layers
num_epochs = 5

layers = OrderedDict()
for i in range(num_layers):
    if i % 2 == 0:
        w = np.zeros((node_num, node_num))
        b = np.zeros(node_num)
        layers[i] = Affine(w, b)
    else:
        layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['Zero'] = avg_forward_std
avg_backward_std_dic['Zero'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)


# ### 2) Standard Normal Distribution
# - Weight Distribution: $W \sim N(0, 1)$
# - Bias Distribution: $B \sim N(0, 1)$

# In[7]:

node_num = 100   # number of nodes in each hidden layer
num_layers = 6   # 3 Affine layers + 3 Sigmoid activation layers
num_epochs = 5

layers = OrderedDict()
for i in range(num_layers):
    if i % 2 == 0:
        w = np.random.randn(node_num, node_num)
        b = np.random.randn(node_num)
        layers[i] = Affine(w, b)
    else:
        layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['N(0.0, 1.0)'] = avg_forward_std
avg_backward_std_dic['N(0.0, 1.0)'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)


# ### 3) Normal Distribution with Standard Deviation 0.01
# - Weight Distribution: $W \sim N(0, 0.0001)$
# - Bias Distribution: $B \sim N(0, 0.0001)$

# In[8]:

node_num = 100   # number of nodes in each hidden layer
num_layers = 6   # 3 Affine layers + 3 Sigmoid activation layers
num_epochs = 5

layers = OrderedDict()
for i in range(num_layers):
    if i % 2 == 0:
        w = np.random.randn(node_num, node_num) * 0.01
        b = np.random.randn(node_num) * 0.01
        layers[i] = Affine(w, b)
    else:
        layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['N(0.0, 0.0001)'] = avg_forward_std
avg_backward_std_dic['N(0.0, 0.0001)'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)


# ### 4) Xavier Initializer with Standard Deviation $1 \over \sqrt{n}$ ($n$ is the number of neurons in the previous layer)
# - Weight Distribution: $W \sim N(0, {1 \over n})$
# - Bias Distribution: $B \sim N(0, {1 \over n})$

# In[9]:

node_num = 100   # number of nodes in each hidden layer
num_layers = 6   # 3 Affine layers + 3 Sigmoid activation layers
num_epochs = 5

layers = OrderedDict()
for i in range(num_layers):
    if i % 2 == 0:
        w = np.random.randn(node_num, node_num) / np.sqrt(node_num)
        b = np.random.randn(node_num) / np.sqrt(node_num)
        layers[i] = Affine(w, b)
    else:
        layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['Xavier'] = avg_forward_std
avg_backward_std_dic['Xavier'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)
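# A small side check (a sketch, not part of the original notebook; the depth and
# shapes are arbitrary) of why Xavier scaling helps: ignoring the sigmoid
# nonlinearity, $Var(\sum_k w_k x_k) = n \cdot Var(w) \cdot Var(x)$, and Xavier
# sets $Var(w) = 1/n$, so the pre-activation standard deviation should stay
# close to the input's across layers instead of exploding or collapsing.

# In[ ]:

h = np.random.randn(1000, 100)   # unit-variance input
print("input std: {:5.4f}".format(np.std(h)))
for depth in range(5):
    w_xavier = np.random.randn(100, 100) / np.sqrt(100)   # Xavier: std = 1/sqrt(n)
    h = np.dot(h, w_xavier)
    # each std should stay close to 1.0; with w ~ N(0, 1) instead, it would
    # grow by roughly sqrt(100) = 10x per layer
    print("pre-activation std after layer {}: {:5.4f}".format(depth + 1, np.std(h)))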
# ### 5) He Initializer with Standard Deviation $\sqrt{2 \over n}$ ($n$ is the number of neurons in the previous layer)
# - Weight Distribution: $W \sim N(0, {2 \over n})$
# - Bias Distribution: $B \sim N(0, {2 \over n})$

# In[10]:

node_num = 100   # number of nodes in each hidden layer
num_layers = 6   # 3 Affine layers + 3 Sigmoid activation layers
num_epochs = 5

layers = OrderedDict()
for i in range(num_layers):
    if i % 2 == 0:
        w = np.random.randn(node_num, node_num) * np.sqrt(2.0 / node_num)
        b = np.random.randn(node_num) * np.sqrt(2.0 / node_num)
        layers[i] = Affine(w, b)
    else:
        layers[i] = Sigmoid()

avg_forward_std, avg_backward_std = activation_value_histogram(layers, num_epochs)
avg_forward_std_dic['He'] = avg_forward_std
avg_backward_std_dic['He'] = avg_backward_std
print(avg_forward_std)
print(avg_backward_std)


# ### Standard Deviation of Forward Activation Values per Epoch (0, 1, 2, 3, 4)

# In[12]:

import pandas as pd

df = pd.DataFrame(avg_forward_std_dic)
df


# ### Standard Deviation of Backward Activation Values per Epoch (0, 1, 2, 3, 4)

# In[13]:

df = pd.DataFrame(avg_backward_std_dic)
df
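# One possible way to visualize the table above (a sketch, not part of the
# original notebook), reusing the `markers` dict defined in section 1 to give
# each initializer its own marker:

# In[ ]:

for key, stds in avg_forward_std_dic.items():
    plt.plot(range(num_epochs), stds, marker=markers[key], label=key)
plt.xlabel("Epoch")
plt.ylabel("Average forward activation std")
plt.legend()
plt.show()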