# Backpropagation!¶

7 x 6 x 5 x 2 의 4계층 신경망 구조로 backpropagation과 numeric gradient, analytic gradient를 검증해본다.

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd

import sys
sys.path.append('/Users/kaonpark/workspace/github.com/likejazz/kaon-learn')
import kaonlearn
from kaonlearn.plots import plot_decision_regions, plot_history

In [2]:
def _gradient_check(analytic, numeric):
numerator = abs(analytic - numeric)
denominator = max(analytic, numeric)
if denominator == 0:
print ("Correct!")
else:
difference = numerator / denominator

# cs231n의 권장 수치는 1e-7이나 그 기준을 맞출 수가 없다.
if difference < 1e-7:
print ("Correct!")
else:
print("\x1b[31mWrong!\x1b[0m")

def gradient_checking(nn, l):
    """Numerically verify layer ``l``'s analytic gradients (weights + biases).

    NOTE(review): the ``def`` line, indentation, and the ``_gradient_check``
    calls were lost in the notebook export; they are reconstructed here from
    the call sites (``gradient_checking(nn, 3)`` / ``gradient_checking(nn, 1)``)
    and the printed "Correct!/Wrong!" output cells -- confirm against the
    original notebook.

    For every weight w[k][j] and bias b[k] of layer ``l``, the central
    difference (E(w+h) - E(w-h)) / (2h) of the squared-error loss is compared
    with the delta produced by one backpropagation step. ``nn.__init__()`` is
    called before each perturbation because the fixed seed restores identical
    parameters, so exactly one parameter differs per forward pass.
    """
    # One training step to populate nn.delta_* with analytic gradients.
    nn.__init__()
    nn.train()

    if l == 1:
        w = nn.w_1
    elif l == 2:
        w = nn.w_2
    elif l == 3:
        w = nn.w_3

    for k in range(w.shape[0]):
        for j in range(w.shape[1]):
            # E(w + h)
            nn.__init__()
            if l == 1:
                nn.w_1[k][j] += nn.h
            elif l == 2:
                nn.w_2[k][j] += nn.h
            elif l == 3:
                nn.w_3[k][j] += nn.h
            nn.query()
            e1 = np.sum((nn.t - nn.out_o) ** 2) / 2

            # E(w - h)
            nn.__init__()
            if l == 1:
                nn.w_1[k][j] -= nn.h
            elif l == 2:
                nn.w_2[k][j] -= nn.h
            elif l == 3:
                nn.w_3[k][j] -= nn.h
            nn.query()
            e2 = np.sum((nn.t - nn.out_o) ** 2) / 2

            if l == 1:
                delta = nn.delta_w_1[k][j]
            elif l == 2:
                delta = nn.delta_w_2[k][j]
            elif l == 3:
                delta = nn.delta_w_3[k][j]

            numeric_gradient = (e1 - e2) / (2 * nn.h)
            print("%.16f, %.16f" % (delta, numeric_gradient), end=", ")
            _gradient_check(delta, numeric_gradient)

        # Bias of row k: same central-difference check.
        nn.__init__()
        if l == 1:
            nn.b_1[k] += nn.h
        elif l == 2:
            nn.b_2[k] += nn.h
        elif l == 3:
            nn.b_3[k] += nn.h
        nn.query()
        e1 = np.sum((nn.t - nn.out_o) ** 2) / 2

        nn.__init__()
        if l == 1:
            nn.b_1[k] -= nn.h
        elif l == 2:
            nn.b_2[k] -= nn.h
        elif l == 3:
            nn.b_3[k] -= nn.h
        nn.query()
        e2 = np.sum((nn.t - nn.out_o) ** 2) / 2

        print()  # blank line between the weight rows and the bias line
        if l == 1:
            delta = nn.delta_b_1[k]
        elif l == 2:
            delta = nn.delta_b_2[k]
        elif l == 3:
            delta = nn.delta_b_3[k]

        numeric_gradient = (e1 - e2) / (2 * nn.h)
        print("%.16f, %.16f" % (delta, numeric_gradient), end=", ")
        _gradient_check(delta, numeric_gradient)
        print()

In [3]:
def sigmoid(z: np.ndarray):
    """Elementwise logistic function: 1 / (1 + e^(-z))."""
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)

def d_sigmoid(z: np.ndarray):
    """Derivative of the logistic function: s(z) * (1 - s(z))."""
    s = sigmoid(z)
    return s * (1.0 - s)

def relu(z: np.ndarray):
    """Rectified linear unit: elementwise max(z, 0)."""
    return np.where(z > 0, z, 0)

def d_relu(z: np.ndarray):
    """ReLU subgradient as a boolean mask (True where z > 0).

    The boolean array acts as a 0/1 multiplier when used in backprop products.
    """
    return np.greater(z, 0)

# --

def GD(self, delta, t, l):
    """Vanilla gradient-descent update: -lr * delta.

    ``t`` (timestep) and ``l`` (parameter slot) are unused; they exist so all
    optimizers share one signature.
    """
    step = self.lr * delta
    return -step

def adam(self, delta, t, l):
    """Adam update for parameter slot ``l`` at timestep ``t``.

    NOTE(review): the ``def`` line was lost in the notebook export; the name
    ``adam`` is reconstructed from the surrounding cells ("# Adam 학습" and
    the ``nn.m`` / ``nn.update_parameters`` bookkeeping) -- confirm against
    the original notebook.

    Maintains the first/second moment estimates in self.m[l] / self.v[l],
    bias-corrects them, and returns the (negative) parameter update.
    """
    beta1 = .9    # first-moment (mean) decay rate
    beta2 = .999  # second-moment (uncentered variance) decay rate

    eps = 1e-8    # numerical stabilizer in the denominator

    self.m[l] = beta1 * self.m[l] + (1. - beta1) * delta
    self.v[l] = beta2 * self.v[l] + (1. - beta2) * delta**2

    # Bias correction compensates for the zero initialization of m and v.
    self.m_k_hat = self.m[l] / (1. - beta1**(t))
    self.v_k_hat = self.v[l] / (1. - beta2**(t))

    self.update_parameters = - (self.lr * self.m_k_hat / (np.sqrt(self.v_k_hat) + eps))
    return self.update_parameters

def momentum(self, delta, t, l):
    """Classical momentum update for parameter slot ``l``.

    Accumulates a decaying velocity in self.m[l] and returns its negation.
    ``t`` is unused; the signature matches the other optimizers.
    """
    gamma = .9  # velocity decay factor

    velocity = gamma * self.m[l] + self.lr * delta
    self.m[l] = velocity
    return -velocity

In [4]:
class NeuralNetwork:
    """Fixed 7-6-5-2 fully connected network (relu, sigmoid, sigmoid).

    A single hard-coded input/target pair is used; the network exists to
    demonstrate backpropagation and gradient checking, not real training.
    ``np.random.seed(12)`` in ``__init__`` means every re-init restores
    identical parameters -- the gradient checker relies on this to perturb
    exactly one parameter at a time.
    """

    def __init__(self):
        # Fixed input column vector (7 x 1).
        self.i = np.array([0.4, -0.2, 0.1, 0.1, -0.15, 0.6, -0.9]).reshape(-1, 1)

        # Deterministic initialization (see class docstring).
        np.random.seed(12)
        self.w_1 = np.random.rand(6, 7)
        self.b_1 = np.random.rand(6).reshape(-1, 1)
        self.w_2 = np.random.rand(5, 6)
        self.b_2 = np.random.rand(5).reshape(-1, 1)
        self.w_3 = np.random.rand(2, 5)
        self.b_3 = np.random.rand(2).reshape(-1, 1)

        # Fixed target (2 x 1).
        self.t = np.array([[0.87503811], [0.83690408]])

        self.lr = 0.1  # learning rate
        self.h = 1e-4  # step size for numeric gradient checking

        # Optimizer state: timestep plus one moment slot per parameter,
        # ordered [w_3, b_3, w_2, b_2, w_1, b_1] to match the slot indices
        # passed by _backward().
        self.iter = 1
        self.m = [
            np.zeros(self.w_3.shape),
            np.zeros(self.b_3.shape),
            np.zeros(self.w_2.shape),
            np.zeros(self.b_2.shape),
            np.zeros(self.w_1.shape),
            np.zeros(self.b_1.shape),
        ]
        self.v = [
            np.zeros(self.w_3.shape),
            np.zeros(self.b_3.shape),
            np.zeros(self.w_2.shape),
            np.zeros(self.b_2.shape),
            np.zeros(self.w_1.shape),
            np.zeros(self.b_1.shape),
        ]

    def _forward(self):
        """Forward pass; stores all pre-/post-activation values on self."""
        self.net_h1 = np.dot(self.w_1, self.i) + self.b_1
        self.out_h1 = relu(self.net_h1)

        self.net_h2 = np.dot(self.w_2, self.out_h1) + self.b_2
        self.out_h2 = sigmoid(self.net_h2)

        self.net_o  = np.dot(self.w_3, self.out_h2) + self.b_3
        self.out_o  = sigmoid(self.net_o)

    def _backward(self, optimizer):
        """Backprop for the squared-error loss E = sum((t - out_o)^2) / 2.

        Each delta is stored on self (the gradient checker reads them) and the
        corresponding parameter is updated via ``optimizer(self, delta,
        self.iter, slot)``. Slot order: w_3=0, b_3=1, w_2=2, b_2=3, w_1=4,
        b_1=5.

        NOTE(review): d_h2_errors / d_h1_errors are computed AFTER w_3 / w_2
        have already been updated, i.e. with the post-update weights. Kept
        as-is to preserve the recorded results, but textbook backprop would
        use the pre-update weights.
        """
        d_o_errors = - (self.t - self.out_o)
        self.delta_w_3 = np.dot(d_o_errors * d_sigmoid(self.net_o), self.out_h2.T)
        self.w_3 += optimizer(self, self.delta_w_3, self.iter, 0)
        self.delta_b_3 = d_o_errors * d_sigmoid(self.net_o)
        self.b_3 += optimizer(self, self.delta_b_3, self.iter, 1)

        d_h2_errors = np.dot(self.w_3.T, d_o_errors * d_sigmoid(self.net_o))
        self.delta_w_2 = np.dot(d_h2_errors * d_sigmoid(self.net_h2), self.out_h1.T)
        self.w_2 += optimizer(self, self.delta_w_2, self.iter, 2)
        self.delta_b_2 = d_h2_errors * d_sigmoid(self.net_h2)
        self.b_2 += optimizer(self, self.delta_b_2, self.iter, 3)

        d_h1_errors = np.dot(self.w_2.T, d_h2_errors * d_sigmoid(self.net_h2))
        self.delta_w_1 = np.dot(d_h1_errors * d_relu(self.net_h1), self.i.T)
        self.w_1 += optimizer(self, self.delta_w_1, self.iter, 4)
        self.delta_b_1 = d_h1_errors * d_relu(self.net_h1)
        self.b_1 += optimizer(self, self.delta_b_1, self.iter, 5)

        self.iter += 1

    def train(self, optimizer=None):
        """One epoch: forward pass then backpropagation.

        ``optimizer`` defaults to plain gradient descent (module-level ``GD``).
        The default is resolved at call time (None sentinel) instead of at
        class-definition time, so the class no longer requires GD to exist
        when it is defined -- backward compatible with ``nn.train()`` and
        ``nn.train(momentum)`` callers.
        """
        if optimizer is None:
            optimizer = GD
        self._forward()
        self._backward(optimizer)

    def query(self):
        """Forward pass only (no parameter updates)."""
        self._forward()

    def result(self):
        """Print the current error (t - out_o) from the last forward pass."""
        print(self.t - self.out_o)

nn = NeuralNetwork()


출력 레이어에 activation(여기서는 sigmoid)이 없다면, 아래 처럼 최종 가중치 행렬의 delta 값과, 이전 가중치 행렬에 부여되는 에러값 계산이 다르다.

def _forward():
...
# 최종 출력 레이어에 activation(sigmoid)이 없다면,
out_o = net_o

def _backward():
...
# 최종 출력 레이어에 activation(sigmoid)이 없다면,
delta_w_3 = np.dot(d_o_errors, out_h2.T)
delta_b_3 = d_o_errors
...
# 이전 레이어의 에러에도 activation 미분이 생략된다.
d_h2_errors = np.dot(w_3.T, d_o_errors)

히든 레이어의 w1에 대한 delta_w1 수식은 아래와 같다.

$$\frac{\partial E_{total}}{\partial w_{1}} = (\sum\limits_{o}{\frac{\partial E_{total}}{\partial out_{o}} * \frac{\partial out_{o}}{\partial net_{o}} * \frac{\partial net_{o}}{\partial out_{h1}}}) * \frac{\partial out_{h1}}{\partial net_{h1}} * \frac{\partial net_{h1}}{\partial w_{1}}$$

$y_n$ 을 구하는 것이 역전파의 핵심이며 수식에서, $$\frac{\partial out_{o}}{\partial net_{o}} * \frac{\partial net_{o}}{\partial out_{h1}}$$ 부분이다. 즉, 출력 레이어의 activation 미분과 이전 가중치(w5, w6)를 곱한 값이 된다.

In [5]:
gradient_checking(nn, 3)

0.0019912952415730, 0.0019912952408351, Correct!
0.0024540703451672, 0.0024540703437870, Correct!
0.0026930630921895, 0.0026930630903522, Correct!
0.0027050489507301, 0.0027050489489859, Correct!
0.0027245918937070, 0.0027245918919703, Correct!

0.0033607543419824, 0.0033607543385456, Correct!

0.0031708367295230, 0.0031708367287473, Correct!
0.0039077361432061, 0.0039077361418081, Correct!
0.0042882959740775, 0.0042882959721928, Correct!
0.0043073816423912, 0.0043073816405163, Correct!
0.0043385008255743, 0.0043385008237015, Correct!

0.0053514933817869, 0.0053514933781237, Correct!


In [6]:
# 거의 비슷하여 정답으로 간주할 수 있으나 cs231n의 기준에는 미치지 못한다.

0.0000000000000000, 0.0000000000000000, Correct!
0.0004138630287177, 0.0004141685682173, Wrong!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0003945576204420, 0.0003948489074763, Wrong!
0.0000004462967620, 0.0000004466261735, Wrong!

0.0007738071862624, 0.0007743784586484, Wrong!

0.0000000000000000, 0.0000000000000000, Correct!
0.0004012666483439, 0.0004015738879013, Wrong!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0003825488215844, 0.0003828417294467, Wrong!
0.0000004327132250, 0.0000004330446678, Wrong!

0.0007502555061708, 0.0007508299572963, Wrong!

0.0000000000000000, 0.0000000000000000, Correct!
0.0007004977316710, 0.0007007702000454, Wrong!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0006678217162559, 0.0006680814749506, Wrong!
0.0000007553945333, 0.0000007556884632, Wrong!

0.0013097332719160, 0.0013102427110119, Wrong!

0.0000000000000000, 0.0000000000000000, Correct!
0.0003570468882901, 0.0003573168525731, Wrong!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0003403917742217, 0.0003406491453153, Wrong!
0.0000003850280384, 0.0000003853192244, Wrong!

0.0006675770214304, 0.0006680817784144, Wrong!

0.0000000000000000, 0.0000000000000000, Correct!
0.0005044746159777, 0.0005047403321950, Wrong!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0004809424622206, 0.0004811957836099, Wrong!
0.0000005440094234, 0.0000005442958757, Wrong!

0.0009432253089627, 0.0009437221237026, Wrong!


In [7]:
gradient_checking(nn, 1)

0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!

0.0000000000000000, 0.0000000000000000, Correct!

0.0009564518308797, 0.0009570341935952, Wrong!
-0.0004782259154398, -0.0004785170967716, Correct!
0.0002391129577199, 0.0002392585483554, Wrong!
0.0002391129577199, 0.0002392585483554, Wrong!
-0.0003586694365799, -0.0003588878226025, Correct!
0.0014346777463195, 0.0014355512905836, Wrong!
-0.0021520166194793, -0.0021533269360316, Correct!

0.0023911295771992, 0.0023925854846212, Wrong!

0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!

0.0000000000000000, 0.0000000000000000, Correct!

0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!
0.0000000000000000, 0.0000000000000000, Correct!

0.0000000000000000, 0.0000000000000000, Correct!

0.0008467909026301, 0.0008473850463073, Wrong!
-0.0004233954513151, -0.0004236925231146, Correct!
0.0002116977256575, 0.0002118462615140, Wrong!
0.0002116977256575, 0.0002118462615140, Wrong!
-0.0003175465884863, -0.0003177693923967, Correct!
0.0012701863539452, 0.0012710775693439, Wrong!
-0.0019052795309177, -0.0019066163540679, Correct!

0.0021169772565753, 0.0021184626157293, Wrong!

0.0006243229367469, 0.0006246487897418, Wrong!
-0.0003121614683734, -0.0003123243948622, Correct!
0.0001560807341867, 0.0001561621973314, Wrong!
0.0001560807341867, 0.0001561621973314, Wrong!
-0.0002341211012801, -0.0002342432961835, Correct!
0.0009364844051203, 0.0009369731846474, Wrong!
-0.0014047266076804, -0.0014054597771707, Correct!

0.0015608073418672, 0.0015616219747362, Wrong!


In [8]:
# Gradient-descent training run: 2,000 epochs, recording row 1 of w_1 and b_1.
# NOTE(review): indentation was lost in the notebook export; the loop
# structure below is reconstructed (one history append per epoch) -- confirm
# against the original notebook.
nn.__init__()

# One history list per input connection of hidden neuron 1.
delta_w_1_history = [[] for _ in range(7)]
w_1_history = [[] for _ in range(7)]
delta_b_1_history = []
b_1_history = []

for _ in range(2000):
    nn.train()

    for j in range(7):
        delta_w_1_history[j].append(nn.delta_w_1[1][j])
        w_1_history[j].append(nn.w_1[1][j])
    delta_b_1_history.append(nn.delta_b_1[1][0])
    b_1_history.append(nn.b_1[1][0])

nn.query()
nn.result()

[[-0.00009784]
[ 0.00001233]]

In [9]:
# plot with various axes scales
# Four-panel figure of the gradient-descent run: per-input delta_w_1 and w_1
# trajectories on top, delta_b_1 / b_1 below.
# NOTE(review): indentation was lost in this notebook export -- the first two
# subplot groups index by j and so presumably sit inside the `for j` loop,
# while the bias panels (223/224) do not use j; confirm against the original.
plt.figure(1)

for j in range(7):
plt.subplot(221)
plt.plot(delta_w_1_history[j])
plt.title("delta_w_1_history")

plt.subplot(222)
plt.plot(w_1_history[j])
plt.title("w_1_history")

# Bias history panels (independent of j).
plt.subplot(223)
plt.plot(delta_b_1_history)
plt.title("delta_b_1_history")

plt.subplot(224)
plt.plot(b_1_history)
plt.title("b_1_history")

# Adjust the subplot layout, because the logit one may take more space
# than usual, due to y-tick labels like "1 - 10^{-3}"
# https://matplotlib.org/gallery/pyplots/pyplot_scales.html#sphx-glr-gallery-pyplots-pyplot-scales-py
plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()


w_1의 기울기가 0에 이르는 지점이 bias 학습이 함께 진행될때는 1,200 epoch 정도이나,
bias 학습을 하지 않으면 1,500 epoch 이상을 넘어선다.

In [10]:
nn.w_1

Out[10]:
array([[ 0.15416284,  0.7400497 ,  0.26331502,  0.53373939,  0.01457496,
0.91874701,  0.90071485],
[-0.02574465,  0.98653238,  0.1224178 ,  0.26903683,  0.62827046,
0.85547602,  0.98585922],
[ 0.00225923,  0.52122603,  0.55203763,  0.48537741,  0.76813415,
0.16071675,  0.76456045],
[ 0.0208098 ,  0.13521018,  0.11627302,  0.30989758,  0.67145265,
0.47122978,  0.8161683 ],
[ 0.2380425 ,  0.75889812,  0.68973628,  0.3146834 ,  0.35397664,
0.90074165,  0.74055676],
[ 0.95018859,  0.76753812,  0.82497802,  0.40660907,  0.45135526,
0.40044423,  0.99541926]])
In [11]:
# Adam training run (2,000 epochs).
nn.__init__()

# NOTE(review): the loop body was lost in this export -- presumably
# `nn.train(adam)`; the output below (error ~ 0 after 2,000 epochs) confirms
# training ran here. Restore from the original notebook.
for _ in range(2000):

nn.query()
nn.result()

[[-0.]
[ 0.]]

In [12]:
nn.w_1

Out[12]:
array([[ 0.15416284,  0.7400497 ,  0.26331502,  0.53373939,  0.01457496,
0.91874701,  0.90071485],
[-0.81311735,  1.80346289, -0.70925378, -0.56263475,  1.45257992,
0.09767795,  1.69928833],
[ 0.00225923,  0.52122603,  0.55203763,  0.48537741,  0.76813415,
0.16071675,  0.76456045],
[ 0.0208098 ,  0.13521018,  0.11627302,  0.30989758,  0.67145265,
0.47122978,  0.8161683 ],
[-0.54559241,  1.5682712 , -0.13245494, -0.50750782,  1.16977011,
0.14286756,  1.45978018],
[ 0.3498036 ,  1.36794471,  0.22462189, -0.19374706,  1.05175024,
-0.19989192,  1.59567079]])
In [13]:
# Momentum training run: 2,000 epochs, recording w_1[1][0] and b_1[1][0].
# NOTE(review): indentation was lost in the notebook export; the appends are
# placed inside the epoch loop (they are plotted later as per-epoch series) --
# confirm against the original notebook.
nn.__init__()

momentum_w_1_history = []
momentum_b_1_history = []

for _ in range(2000):
    nn.train(momentum)

    momentum_w_1_history.append(nn.w_1[1][0])
    momentum_b_1_history.append(nn.b_1[1][0])

nn.query()
nn.result()

[[0.]
[0.]]

In [14]:
nn.w_1

Out[14]:
array([[ 0.15416284,  0.7400497 ,  0.26331502,  0.53373939,  0.01457496,
0.91874701,  0.90071485],
[-0.02605567,  0.98668788,  0.12234005,  0.26895908,  0.6283871 ,
0.85500949,  0.98655901],
[ 0.00225923,  0.52122603,  0.55203763,  0.48537741,  0.76813415,
0.16071675,  0.76456045],
[ 0.0208098 ,  0.13521018,  0.11627302,  0.30989758,  0.67145265,
0.47122978,  0.8161683 ],
[ 0.23701052,  0.75941411,  0.68947829,  0.31442541,  0.35436363,
0.89919368,  0.74287871],
[ 0.94906447,  0.76810018,  0.82469699,  0.40632804,  0.45177681,
0.39875805,  0.99794853]])
In [15]:
# Two-panel figure: w_1[1][0] and b_1[1][0] trajectories recorded above.
plt.figure(1)

plt.subplot(221)
plt.plot(w_1_history[0])
plt.title("w_1_history")

plt.subplot(222)
plt.plot(b_1_history)
plt.title("b_1_history")

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()

# Adam's training time was only about 1/20 of GD's, but the values it
# converges to are quite different.

In [16]:
# Overlay the momentum trajectories (first series in each panel) on top of
# the gradient-descent trajectories (second series).
plt.figure(1)

plt.subplot(221)
plt.plot(momentum_w_1_history)
plt.plot(w_1_history[0])
plt.title("w_1_history")

plt.subplot(222)
plt.plot(momentum_b_1_history)
plt.plot(b_1_history)
plt.title("b_1_history")

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()

# Momentum converges to nearly the same values as GD, with only about 1/12
# the training time.

In [17]:
# Zoom into the first 200 epochs of the momentum run.
plt.figure(1)

plt.subplot(221)
plt.plot(momentum_w_1_history[:200])
plt.title("w_1_history")

plt.subplot(222)
plt.plot(momentum_b_1_history[:200])
plt.title("b_1_history")

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()


In [18]:
# Inspect Adam's moment estimates (m, v) over a short 200-epoch run.
nn.__init__()

update_parameters = []

# NOTE(review): lines were lost in this export -- `adam_m_1_history` is never
# initialized and the loop body (presumably `nn.train(adam)` preceding the
# two appends) is incomplete; restore from the original notebook.
for _ in range(200):

adam_m_1_history.append(nn.m[1][0]) # to match the m_k_hat basis, use slot 5 (the last layer) instead.
update_parameters.append(nn.update_parameters[5])

nn.query()
nn.result()

[[-0.00000018]
[-0.00000102]]

In [19]:
# NOTE(review): this cell is truncated in the export -- the plot/title calls
# for each panel are missing, and subplot(224) appears twice; restore from
# the original notebook.
plt.figure(1)

plt.subplot(221)

plt.subplot(222)

plt.subplot(223)

plt.subplot(224)

plt.subplot(224)