#!/usr/bin/env python
# coding: utf-8

# # Gradient Descent (경사 하강법)
# - Explanation of gradient descent
#   - http://nobilitycat.tistory.com/entry/Gradient-Descent
# - Gradient descent explained for univariate linear regression
# - Hypothesis Function
# $$h_{\theta_0, \theta_1}(x) = \theta_0 + \theta_1 \cdot x$$
# - Cost Function $J(\theta_0, \theta_1)$
# \begin{eqnarray}
# J(\theta_0, \theta_1) &=& \dfrac{1}{2m} \sum_{i=1}^m \big( h_{\theta_0, \theta_1}(x^i) - y^i \big)^2 \\
# &=& \dfrac{1}{2m}\sum_{i=1}^m \big( \theta_0 + \theta_1\cdot x^i - y^i \big)^2
# \end{eqnarray}
# - We need the partial derivatives with respect to both $\theta_0$ and $\theta_1$:
# \begin{eqnarray}
# \frac{\partial}{\partial \theta_0}J(\theta_0, \theta_1) &=& \frac{1}{m} \sum_{i=1}^m (\theta_0 + \theta_1 x^{i}-y^{i}) \\
# \frac{\partial}{\partial \theta_1}J(\theta_0, \theta_1) &=& \frac{1}{m} \sum_{i=1}^m (\theta_0 + \theta_1 x^{i}-y^{i})\cdot x^{i}
# \end{eqnarray}

# #### Batch Gradient Descent
# - 1) Pick an initial value for $\hat \theta$.
# - 2) Compute the gradient over all $m$ training samples:
# $$ \frac{\partial J}{\partial \theta} = \big( \frac{\partial J(\theta_0, \theta_1)}{\partial \theta_0}, \frac{\partial J(\theta_0, \theta_1)}{\partial \theta_1} \big) $$
# - 3) Update simultaneously with a proper learning rate $\alpha$:
# $$temp_0 := \theta_0 -\alpha \frac{\partial}{\partial \theta_0}J(\theta_0, \theta_1)\\
# temp_1 := \theta_1 -\alpha \frac{\partial}{\partial \theta_1}J(\theta_0, \theta_1)$$
# $$\theta_0 := temp_0\\
# \theta_1 := temp_1.$$
# - 4) Repeat 2) and 3) until the update is small or the iteration maximum is reached.

# #### Stochastic (or Incremental) Gradient Descent
# - 1) Pick an initial value for $\hat \theta$.
# - 2) Pick one training example $(x^i, y^i)$ at a time and use the gradient of its individual loss term instead of the full sum.
# - 3) Update with a proper learning rate $\alpha$ after each example:
# $$\theta_0 := \theta_0 -\alpha \big( \theta_0 + \theta_1 x^{i} - y^{i} \big)\\
# \theta_1 := \theta_1 -\alpha \big( \theta_0 + \theta_1 x^{i} - y^{i} \big)\cdot x^{i}.$$
# - 4) Repeat 2) and 3) until the update is small or the iteration maximum is reached.
# - This method makes progress toward the minimum more rapidly, but it can overshoot the minimum and then oscillate around it (a code sketch of this variant follows the batch implementation below).

# #### Learning Rate Control
# - By slowly letting the learning rate $\alpha$ decrease to zero as the algorithm runs, it is also possible to ensure that the parameters converge to the global minimum rather than merely oscillate around it.

# #### Local Optimum vs. Global Optimum
# - For linear regression, the cost function $J(\theta)$ has no local optimum other than the global optimum.
# - In general cases, however, we need to watch out for local optima.

# #### Batch Gradient Descent - Python Code
# - We get $\theta_0$ and $\theta_1$ as its output:

# In[2]:

import numpy as np
from sklearn.datasets import make_regression
from scipy import stats


def gradient_descent(alpha, x, y, ep=0.0001, max_iter=10000):
    converged = False
    n_iter = 0
    m = x.shape[0]  # number of samples

    # initial theta
    t0 = np.random.random(x.shape[1])
    t1 = np.random.random(x.shape[1])

    # total squared error (proportional to J(theta))
    J = sum([(t0 + t1*x[i] - y[i])**2 for i in range(m)])

    # iterate loop
    while not converged:
        # compute the gradient (d/d_theta J(theta)) over all training samples
        grad0 = 1.0/m * sum([(t0 + t1*x[i] - y[i]) for i in range(m)])
        grad1 = 1.0/m * sum([(t0 + t1*x[i] - y[i])*x[i] for i in range(m)])

        # compute theta_temp
        temp0 = t0 - alpha * grad0
        temp1 = t1 - alpha * grad1

        # update theta simultaneously
        t0 = temp0
        t1 = temp1

        # squared error after the update
        e = sum([(t0 + t1*x[i] - y[i])**2 for i in range(m)])

        if abs(J - e) <= ep:
            print('Converged, iterations:', n_iter, '!!!')
            converged = True

        J = e        # update error
        n_iter += 1  # update iteration count

        if n_iter == max_iter:
            print('Max iterations exceeded!')
            converged = True

    return t0, t1


# - Data Preparation
#   - [Note]: http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html

# In[3]:

x, y = make_regression(n_samples=100, n_features=1, n_informative=1, noise=35)
print('x.shape = %s, y.shape = %s' % (x.shape, y.shape))
print(x[0:5])
print(y[0:5])


# - Do gradient descent!

# In[4]:

alpha = 0.01  # learning rate
ep = 0.01     # convergence criterion

# call gradient descent, and get intercept (= theta0) and slope (= theta1)
theta0, theta1 = gradient_descent(alpha, x, y, ep, max_iter=10000)
print('theta0 = %s, theta1 = %s' % (theta0, theta1))

# check with scipy linear regression
slope, intercept, r_value, p_value, slope_std_error = stats.linregress(x[:, 0], y)
print('intercept = %s, slope = %s' % (intercept, slope))


# In[6]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

fig = plt.figure(figsize=(9, 8))
ax1 = fig.add_subplot(111)
ax1.scatter(x[:, 0], y)
y_predict = theta0 + theta1*x
ax1.plot(x[:, 0], y_predict)
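
# - The notebook only implements the batch version. Below is a minimal sketch of the stochastic (incremental) variant described above, combined with a slowly decaying learning rate as in the "Learning Rate Control" note. It reuses `x`, `y`, and `alpha` from the cells above; the function name `stochastic_gradient_descent`, the epoch count, and the decay schedule `alpha / (1 + decay * epoch)` are illustrative assumptions, not part of the original code.

# In[ ]:

def stochastic_gradient_descent(alpha, x, y, n_epochs=50, decay=0.01):
    """Per-example (incremental) updates with a slowly decaying learning rate."""
    m = x.shape[0]
    t0 = np.random.random(x.shape[1])
    t1 = np.random.random(x.shape[1])

    for epoch in range(n_epochs):
        # let the effective learning rate shrink toward zero so the parameters
        # settle near the minimum instead of oscillating around it
        lr = alpha / (1.0 + decay * epoch)
        for i in np.random.permutation(m):
            # gradient of the single-example loss at (x[i], y[i])
            err = t0 + t1*x[i] - y[i]
            t0 = t0 - lr * err
            t1 = t1 - lr * err * x[i]

    return t0, t1


sgd_theta0, sgd_theta1 = stochastic_gradient_descent(alpha, x, y)
print('SGD: theta0 = %s, theta1 = %s' % (sgd_theta0, sgd_theta1))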