#!/usr/bin/env python
# coding: utf-8

# # Backpropagation!
#
# Using a 4-layer 7 x 6 x 5 x 2 network, we verify backpropagation by comparing the analytic gradient against the numeric gradient.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd

import sys
sys.path.append('/Users/kaonpark/workspace/github.com/likejazz/kaon-learn')
import kaonlearn
from kaonlearn.plots import plot_decision_regions, plot_history


# In[2]:

def _gradient_check(analytic, numeric):
    numerator = abs(analytic - numeric)
    denominator = max(abs(analytic), abs(numeric))

    if denominator == 0:
        print("Correct!")
    else:
        difference = numerator / denominator
        # cs231n's recommended threshold is 1e-7, but this network cannot quite meet it.
        if difference < 1e-7:
            print("Correct!")
        else:
            print("\x1b[31mWrong!\x1b[0m")

def gradient_checking(nn, l=3):
    nn.__init__()
    nn.train()

    if l == 1:
        w = nn.w_1
    elif l == 2:
        w = nn.w_2
    elif l == 3:
        w = nn.w_3

    for k in range(w.shape[0]):
        for j in range(w.shape[1]):
            # Loss with the single weight w[k][j] nudged up by h.
            nn.__init__()
            if l == 1:
                nn.w_1[k][j] += nn.h
            elif l == 2:
                nn.w_2[k][j] += nn.h
            elif l == 3:
                nn.w_3[k][j] += nn.h
            nn.query()
            e1 = np.sum((nn.t - nn.out_o) ** 2) / 2

            # Loss with the same weight nudged down by h.
            nn.__init__()
            if l == 1:
                nn.w_1[k][j] -= nn.h
            elif l == 2:
                nn.w_2[k][j] -= nn.h
            elif l == 3:
                nn.w_3[k][j] -= nn.h
            nn.query()
            e2 = np.sum((nn.t - nn.out_o) ** 2) / 2

            if l == 1:
                delta = nn.delta_w_1[k][j]
            elif l == 2:
                delta = nn.delta_w_2[k][j]
            elif l == 3:
                delta = nn.delta_w_3[k][j]

            numeric_gradient = (e1 - e2) / (2 * nn.h)
            # Check whether the numeric gradient matches the analytic gradient.
            print("%.16f, %.16f" % (delta, numeric_gradient), end=", ")
            _gradient_check(delta, numeric_gradient)

        # Repeat the same check for the bias of row k.
        nn.__init__()
        if l == 1:
            nn.b_1[k] += nn.h
        elif l == 2:
            nn.b_2[k] += nn.h
        elif l == 3:
            nn.b_3[k] += nn.h
        nn.query()
        e1 = np.sum((nn.t - nn.out_o) ** 2) / 2

        nn.__init__()
        if l == 1:
            nn.b_1[k] -= nn.h
        elif l == 2:
            nn.b_2[k] -= nn.h
        elif l == 3:
            nn.b_3[k] -= nn.h
        nn.query()
        e2 = np.sum((nn.t - nn.out_o) ** 2) / 2

        print()
        if l == 1:
            delta = nn.delta_b_1[k]
        elif l == 2:
            delta = nn.delta_b_2[k]
        elif l == 3:
            delta = nn.delta_b_3[k]

        numeric_gradient = (e1 - e2) / (2 * nn.h)
        print("%.16f, %.16f" % (delta, numeric_gradient), end=", ")
        _gradient_check(delta, numeric_gradient)
        print()
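# The relative error computed in `_gradient_check` follows the cs231n recommendation:
#
# $$difference = \frac{\lvert g_{analytic} - g_{numeric} \rvert}{\max(\lvert g_{analytic} \rvert, \lvert g_{numeric} \rvert)}$$
#
# As a standalone sketch of the same centered-difference idea; the toy function `f` and the values `x`, `h` below are hypothetical and not taken from the network in this notebook:
#
# ```
# import numpy as np
#
# def f(x):
#     return x ** 3                               # analytic gradient: 3 * x ** 2
#
# x, h = 0.7, 1e-4
# numeric = (f(x + h) - f(x - h)) / (2 * h)       # centered difference
# analytic = 3 * x ** 2
# print(abs(analytic - numeric) / max(abs(analytic), abs(numeric)))
# ```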
# In[3]:

def sigmoid(z: np.ndarray):
    return 1 / (1 + np.exp(-z))

def d_sigmoid(z: np.ndarray):
    return sigmoid(z) * (1.0 - sigmoid(z))

def relu(z: np.ndarray):
    return np.maximum(z, 0)

def d_relu(z: np.ndarray):
    # Boolean mask; multiplying by it acts as the 0/1 derivative of ReLU.
    return z > 0

# --
def GD(self, delta, t, l):
    return - self.lr * delta

def adam(self, delta, t, l):
    beta1 = .9
    beta2 = .999
    eps = 1e-8

    self.m[l] = beta1 * self.m[l] + (1. - beta1) * delta
    self.v[l] = beta2 * self.v[l] + (1. - beta2) * delta ** 2
    self.m_k_hat = self.m[l] / (1. - beta1 ** t)
    self.v_k_hat = self.v[l] / (1. - beta2 ** t)

    self.update_parameters = - (self.lr * self.m_k_hat / (np.sqrt(self.v_k_hat) + eps))
    return self.update_parameters

def momentum(self, delta, t, l):
    gamma = .9
    self.m[l] = gamma * self.m[l] + self.lr * delta
    return - self.m[l]


# In[4]:

class NeuralNetwork:
    def __init__(self):
        self.i = np.array([0.4, -0.2, 0.1, 0.1, -0.15, 0.6, -0.9]).reshape(-1, 1)

        np.random.seed(12)
        self.w_1 = np.random.rand(6, 7)
        self.b_1 = np.random.rand(6).reshape(-1, 1)
        self.w_2 = np.random.rand(5, 6)
        self.b_2 = np.random.rand(5).reshape(-1, 1)
        self.w_3 = np.random.rand(2, 5)
        self.b_3 = np.random.rand(2).reshape(-1, 1)

        self.t = np.array([[0.87503811], [0.83690408]])

        self.lr = 0.1
        self.h = 1e-4

        # Optimizer Parameters
        self.iter = 1
        self.m = [
            np.zeros(self.w_3.shape), np.zeros(self.b_3.shape),
            np.zeros(self.w_2.shape), np.zeros(self.b_2.shape),
            np.zeros(self.w_1.shape), np.zeros(self.b_1.shape),
        ]
        self.v = [
            np.zeros(self.w_3.shape), np.zeros(self.b_3.shape),
            np.zeros(self.w_2.shape), np.zeros(self.b_2.shape),
            np.zeros(self.w_1.shape), np.zeros(self.b_1.shape),
        ]

    def _forward(self):
        self.net_h1 = np.dot(self.w_1, self.i) + self.b_1
        self.out_h1 = relu(self.net_h1)

        self.net_h2 = np.dot(self.w_2, self.out_h1) + self.b_2
        self.out_h2 = sigmoid(self.net_h2)

        self.net_o = np.dot(self.w_3, self.out_h2) + self.b_3
        self.out_o = sigmoid(self.net_o)

    def _backward(self, optimizer):
        # Output layer.
        d_o_errors = - (self.t - self.out_o)
        self.delta_w_3 = np.dot(d_o_errors * d_sigmoid(self.net_o), self.out_h2.T)
        self.w_3 += optimizer(self, self.delta_w_3, self.iter, 0)
        self.delta_b_3 = d_o_errors * d_sigmoid(self.net_o)
        self.b_3 += optimizer(self, self.delta_b_3, self.iter, 1)

        # Second hidden layer (note that w_3 has already been updated above).
        d_h2_errors = np.dot(self.w_3.T, d_o_errors * d_sigmoid(self.net_o))
        self.delta_w_2 = np.dot(d_h2_errors * d_sigmoid(self.net_h2), self.out_h1.T)
        self.w_2 += optimizer(self, self.delta_w_2, self.iter, 2)
        self.delta_b_2 = d_h2_errors * d_sigmoid(self.net_h2)
        self.b_2 += optimizer(self, self.delta_b_2, self.iter, 3)

        # First hidden layer.
        d_h1_errors = np.dot(self.w_2.T, d_h2_errors * d_sigmoid(self.net_h2))
        self.delta_w_1 = np.dot(d_h1_errors * d_relu(self.net_h1), self.i.T)
        self.w_1 += optimizer(self, self.delta_w_1, self.iter, 4)
        self.delta_b_1 = d_h1_errors * d_relu(self.net_h1)
        self.b_1 += optimizer(self, self.delta_b_1, self.iter, 5)

        self.iter += 1

    def train(self, optimizer=GD):
        self._forward()
        self._backward(optimizer)

    def query(self):
        self._forward()

    def result(self):
        print(self.t - self.out_o)

nn = NeuralNetwork()


# If the output layer had no activation (sigmoid here), the delta of the final weight matrix and the
# error propagated back to the previous weight matrix would be computed differently, as shown below.
#
# ```
# def _forward():
#     ...
#     # Without an activation (sigmoid) on the final output layer,
#     out_o = net_o
#
# def _backward():
#     ...
#     # Without an activation (sigmoid) on the final output layer,
#     delta_w_3 = np.dot(d_o_errors, out_h2.T)
#     delta_b_3 = d_o_errors
#     ...
#     # The activation derivative also drops out of the previous layer's error.
#     d_h2_errors = np.dot(w_3.T, d_o_errors)
# ```
#
# The formula for delta_w_1, the gradient with respect to the hidden-layer weights w_1, is:
#
# $$\frac{\partial E_{total}}{\partial w_{1}} = \left(\sum\limits_{o}{\frac{\partial E_{total}}{\partial out_{o}} * \frac{\partial out_{o}}{\partial net_{o}} * \frac{\partial net_{o}}{\partial out_{h1}}}\right) * \frac{\partial out_{h1}}{\partial net_{h1}} * \frac{\partial net_{h1}}{\partial w_{1}}$$
#
# Computing $y_n$ is the core of backpropagation; in the formula it is the part
#
# $$\frac{\partial out_{o}}{\partial net_{o}} * \frac{\partial net_{o}}{\partial out_{h1}}$$
#
# that is, the derivative of the output layer's activation multiplied by the weights leading into the output layer (w5, w6).
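# As a scalar illustration of this chain rule, here is a minimal sketch; the 1-1-1 toy network and the values `i`, `w1`, `w2`, `t` below are made up and unrelated to the network in this notebook:
#
# ```
# import numpy as np
#
# # 1-1-1 toy network: i -> (w1, relu) -> (w2, sigmoid) -> E = (t - out_o)**2 / 2
# i, w1, w2, t = 0.5, 0.8, -1.2, 0.3
# sigmoid = lambda z: 1 / (1 + np.exp(-z))
#
# net_h = w1 * i
# out_h = max(net_h, 0.0)
# net_o = w2 * out_h
# out_o = sigmoid(net_o)
#
# # Chain rule, term by term:
# # dE/dw1 = dE/dout_o * dout_o/dnet_o * dnet_o/dout_h * dout_h/dnet_h * dnet_h/dw1
# analytic = -(t - out_o) * out_o * (1 - out_o) * w2 * (1.0 if net_h > 0 else 0.0) * i
#
# # Centered-difference check of the same quantity.
# h = 1e-6
# loss = lambda w: (t - sigmoid(w2 * max(w * i, 0.0))) ** 2 / 2
# numeric = (loss(w1 + h) - loss(w1 - h)) / (2 * h)
# print(analytic, numeric)    # the two values agree closely
# ```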
# In[5]:

gradient_checking(nn, 3)


# In[6]:

# The results are nearly identical and can be considered correct, but they fall short of the cs231n threshold.
gradient_checking(nn, 2)


# In[7]:

gradient_checking(nn, 1)


# In[8]:

# Training with Gradient Descent.
nn.__init__()

delta_w_1_history = []
w_1_history = []
for _ in range(7):
    delta_w_1_history.append([])
    w_1_history.append([])
delta_b_1_history = []
b_1_history = []

for _ in range(2000):
    nn.train()
    for j in range(7):
        delta_w_1_history[j].append(nn.delta_w_1[1][j])
        w_1_history[j].append(nn.w_1[1][j])
    delta_b_1_history.append(nn.delta_b_1[1][0])
    b_1_history.append(nn.b_1[1][0])

nn.query()
nn.result()


# In[9]:

# plot with various axes scales
plt.figure(1)

for j in range(7):
    plt.subplot(221)
    plt.plot(delta_w_1_history[j])
    plt.title("delta_w_1_history")

    plt.subplot(222)
    plt.plot(w_1_history[j])
    plt.title("w_1_history")

plt.subplot(223)
plt.plot(delta_b_1_history)
plt.title("delta_b_1_history")

plt.subplot(224)
plt.plot(b_1_history)
plt.title("b_1_history")

# Adjust the subplot layout, because the logit one may take more space
# than usual, due to y-tick labels like "1 - 10^{-3}"
# https://matplotlib.org/gallery/pyplots/pyplot_scales.html#sphx-glr-gallery-pyplots-pyplot-scales-py
plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()

# The gradient of w_1 reaches zero at around 1,200 epochs when the biases are trained along with the weights,
# but it takes more than 1,500 epochs when the biases are not trained.

# In[10]:

nn.w_1


# In[11]:

# Training with Adam.
nn.__init__()

adam_w_1_history = []
adam_b_1_history = []
for _ in range(2000):
    nn.train(adam)
    adam_w_1_history.append(nn.w_1[1][0])
    adam_b_1_history.append(nn.b_1[1][0])

nn.query()
nn.result()


# In[12]:

nn.w_1


# In[13]:

# Training with Momentum.
nn.__init__()

momentum_w_1_history = []
momentum_b_1_history = []
for _ in range(2000):
    nn.train(momentum)
    momentum_w_1_history.append(nn.w_1[1][0])
    momentum_b_1_history.append(nn.b_1[1][0])

nn.query()
nn.result()


# In[14]:

nn.w_1


# In[15]:

plt.figure(1)

plt.subplot(221)
plt.plot(adam_w_1_history)
plt.plot(w_1_history[0])
plt.title("w_1_history")

plt.subplot(222)
plt.plot(adam_b_1_history)
plt.plot(b_1_history)
plt.title("b_1_history")

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()

# Adam needed only about 1/20 of the training time GD did, although the values it converges to are quite different.

# In[16]:

plt.figure(1)

plt.subplot(221)
plt.plot(momentum_w_1_history)
plt.plot(w_1_history[0])
plt.title("w_1_history")

plt.subplot(222)
plt.plot(momentum_b_1_history)
plt.plot(b_1_history)
plt.title("b_1_history")

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()

# Momentum reaches values very close to GD's, while needing only about 1/12 of the training time.

# In[17]:

plt.figure(1)

plt.subplot(221)
plt.plot(momentum_w_1_history[:200])
plt.plot(adam_w_1_history[:200])
plt.title("w_1_history")

plt.subplot(222)
plt.plot(momentum_b_1_history[:200])
plt.plot(adam_b_1_history[:200])
plt.title("b_1_history")

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()

# A comparison of Adam and Momentum over the first 200 epochs; the curve with the steeper drop is Adam.
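# For reference, the update implemented in `adam()` above can be written out as follows, with $\beta_1 = 0.9$, $\beta_2 = 0.999$, $\epsilon = 10^{-8}$; the symbols $\eta$ (learning rate), $g_t$ (gradient), and $\theta$ (parameter) are just notation introduced here:
#
# $$m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2$$
#
# $$\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1 - \beta_2^t}, \qquad \theta_t = \theta_{t-1} - \frac{\eta \, \hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}$$
#
# The next cells record how $m$, $v$, and their bias-corrected estimates $\hat{m}$, $\hat{v}$ evolve during training.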
# In[18]:

# Inspect Adam's m and v.
nn.__init__()

adam_m_1_history = []
adam_v_1_history = []
adam_m_1_hat_history = []
adam_v_1_hat_history = []
update_parameters = []
for _ in range(200):
    nn.train(adam)
    adam_m_1_history.append(nn.m[1][0])
    # To track the same slot that m_k_hat refers to, index 5 (the last layer processed) would be used instead.
    adam_v_1_history.append(nn.v[1][0])
    adam_m_1_hat_history.append(nn.m_k_hat[5])
    adam_v_1_hat_history.append(nn.v_k_hat[5])
    update_parameters.append(nn.update_parameters[5])

nn.query()
nn.result()


# In[19]:

plt.figure(1)

plt.subplot(221)
plt.plot(adam_m_1_history)
plt.title("adam_m_1_history")

plt.subplot(222)
plt.plot(adam_v_1_history)
plt.title("adam_v_1_history")

plt.subplot(223)
plt.plot(adam_m_1_hat_history)
plt.title("adam_m_1_hat_history")

plt.subplot(224)
plt.plot(adam_v_1_hat_history)
plt.title("adam_v_1_hat_history")

plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.5, wspace=0.35)
plt.show()

# Adam's m and v histories.

# In[20]:

plt.subplot(224)
plt.plot(update_parameters)
plt.title("update_parameters")
plt.show()
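# As a closing sketch, and only a suggestion that is not part of the original notebook, the three optimizers used above could be compared side by side by retraining from the same seed and printing the remaining error `t - out_o`:
#
# ```
# for name, opt in [("GD", GD), ("Momentum", momentum), ("Adam", adam)]:
#     nn = NeuralNetwork()    # seed 12 in __init__ gives identical starting weights
#     for _ in range(2000):
#         nn.train(opt)
#     nn.query()
#     print(name, (nn.t - nn.out_o).ravel())
# ```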