#!/usr/bin/env python # coding: utf-8 # ### define optimization methods # #### Generic optimizer # In[43]: import sympy def erase_buf(method): try: delattr(method,'buf') except AttributeError: pass def load_buf(method): try: v=getattr(method,'buf') except AttributeError: v=0 return v def opt(method,function,starting_point,learning_rate=0.12,steps=10): erase_buf(method) x,y=sympy.symbols('x,y') point=starting_point plist=[point] fx=function.diff(x) fy=function.diff(y) for i in range(steps): point=method(function,fx,fy,point,learning_rate) plist.append(point) return plist # #### Optimizers # In[44]: import sympy import numpy as np import math # Stochastic Gradient Descent def sgd_step(f,fx,fy,point,learning_rate=0.1): x,y=sympy.symbols('x,y') dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1]))) x=np.array(point) # not a good symbol, try to match stanford note, unfortunately symbol used earlier x=x-learning_rate*dx return (x[0],x[1]) # SGD with Momentum def momentum_step(f,fx,fy,point,learning_rate=0.1,mu=0.5): # load buf v=load_buf(momentum_step) x,y=sympy.symbols('x,y') dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1]))) x=np.array(point) # not a good symbol, try to match stanford note, unfortunately symbol used earlier v=mu*v-learning_rate*dx x=x+v # save buf momentum_step.buf = v return (x[0],x[1]) # Nesterov Momentum def nag_step(f,fx,fy,point,learning_rate=0.1,mu=0.5): v=np.zeros(2)+load_buf(nag_step) # compute gradient x,y=sympy.symbols('x,y') dx=np.array((fx.subs(x,point[0]+mu*v[0]).subs(y,point[1]+mu*v[1]), fy.subs(x,point[0]+mu*v[0]).subs(y,point[1]+mu*v[1]))) #print (dx) x=np.array(point) # not a good symbol, try to match stanford note, unfortunately symbol used earlier # update v = mu*v-learning_rate*dx x=x+v # save buf nag_step.buf = v return (x[0],x[1]) # AdaGrad def adagrad_step(f,fx,fy,point,learning_rate=0.1): cache = load_buf(adagrad_step) x,y=sympy.symbols('x,y') dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1]))) x=np.array(point) # not a good symbol, try to match stanford note, unfortunately symbol used earlier cache += dx[0]**2 + dx[1]**2 x=x-learning_rate*dx/(math.sqrt(cache)+1e-7) adagrad_step.buf=cache return (x[0],x[1]) # RMSProp def rmsprop_step(f,fx,fy,point,learning_rate=0.1,decay_rate=0.5): cache = load_buf(rmsprop_step) x,y=sympy.symbols('x,y') dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1]))) x=np.array(point) # not a good symbol, try to match stanford note, unfortunately symbol used earlier cache = decay_rate*cache + (1-decay_rate)*(dx[0]**2 + dx[1]**2) x=x-learning_rate*dx/(math.sqrt(cache)+1e-7) rmsprop_step.buf=cache return (x[0],x[1]) # Adam def adam_step(f,fx,fy,point,learning_rate=0.1,beta1=0.5,beta2=0.5): buf = np.zeros(4)+load_buf(adam_step) t=buf[0] m=buf[1:3] v=buf[3] x,y=sympy.symbols('x,y') dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1]))) x=np.array(point) # not a good symbol, try to match stanford note, unfortunately symbol used earlier t=t+1 m=beta1*m+(1-beta1)*dx v=beta2*v+(1-beta2)*(dx[0]**2 + dx[1]**2) mb=m/(1-beta1**t) vb=v/(1-beta2**t) x=x-learning_rate*mb/(math.sqrt(vb)+1e-7) adam_step.buf=np.concatenate([np.ones(1)*t,m,np.ones(1)*v]) return (x[0],x[1]) # Newton def newton_step(f,fx,fy,point,learning_rate=0.1,beta1=0.5,beta2=0.5): x,y=sympy.symbols('x,y') H=np.zeros((2,2)) H[0,0]=fx.diff(x).subs(x,point[0]).subs(y,point[1]) H[0,1]=fx.diff(y).subs(x,point[0]).subs(y,point[1]) H[1,0]=fy.diff(x).subs(x,point[0]).subs(y,point[1]) H[1,1]=fy.diff(y).subs(x,point[0]).subs(y,point[1]) dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1]))) x=np.array(point)-np.linalg.inv(H).dot(dx) return (x[0],x[1]) # In[60]: import matplotlib.pyplot as plt import string import numpy as np get_ipython().run_line_magic('matplotlib', 'inline') def del_variable(var): try: del globals()[var] except: pass def del_variables(varlist): for var in varlist: del_variable(var) def form_legend_list(varlist): li=[] for var in varlist: if var in globals(): li.append(var) def plot_contour(z,rang=(-10,10,-10,10),delta=0.05): xr=np.arange(rang[0],rang[1],delta) yr=np.arange(rang[2],rang[3],delta) [X,Y]=np.meshgrid(xr,yr) cmd=string.upper(str(z)) Z=eval(cmd) plt.contour(X,Y,Z) # ### Main # In[84]: import sympy x,y=sympy.symbols('x,y') z=x**2 + 8*y**2 # define function sgd_learning_rate=0.12 #0.12 momentum_learning_rate=0.12 # more stable than sgd nag_learning_rate=0.09 # less stable than sgd ada_learning_rate=20 # very stable rms_learning_rate=3 # tend to ocillate near optimum adam_learning_rate=4 # more stable than rmsprop starting_point=(9,9) num_step=20 plist_sgd=opt(sgd_step,z,starting_point,sgd_learning_rate,num_step) plist_momentum=opt(momentum_step,z,starting_point,momentum_learning_rate,num_step) plist_nag=opt(nag_step,z,starting_point,nag_learning_rate,num_step) plist_ada=opt(adagrad_step,z,starting_point,ada_learning_rate,num_step) plist_rmsprop=opt(rmsprop_step,z,starting_point,rms_learning_rate,num_step) plist_adam=opt(adam_step,z,starting_point,adam_learning_rate,num_step) plist_newton=opt(newton_step,z,starting_point,0,num_step) # learning rate (0) is a dummy plot_contour(z) del_variables(['sgd','momentum','nag','ada','rmsprop','adam','newton']) sgd,=plt.plot(np.array(plist_sgd).T[0],np.array(plist_sgd).T[1],label='sgd') momentum,=plt.plot(np.array(plist_momentum).T[0],np.array(plist_momentum).T[1],label='momentum') nag,=plt.plot(np.array(plist_nag).T[0],np.array(plist_nag).T[1],label='nag') ada,=plt.plot(np.array(plist_ada).T[0],np.array(plist_ada).T[1],label='adagrad') rmsprop,=plt.plot(np.array(plist_rmsprop).T[0],np.array(plist_rmsprop).T[1],label='rmsprop') adam,=plt.plot(np.array(plist_adam).T[0],np.array(plist_adam).T[1],label='adam') newton,=plt.plot(np.array(plist_newton).T[0],np.array(plist_newton).T[1],label='newton') legend_list=form_legend_list(['sgd','momentum','nag','ada','rmsprop','adam','newton']) plt.legend(bbox_to_anchor=(0.4, 1),handles=legend_list) # In[85]: import sympy x,y=sympy.symbols('x,y') z=x**2 + 4*y**4 # define function sgd_learning_rate=0.001 #0.12 momentum_learning_rate=0.001 # more stable than sgd nag_learning_rate=0.001 # less stable than sgd ada_learning_rate=10 # very stable (converge very slowly) rms_learning_rate=10 # occillate near optimum adam_learning_rate=5 # even more stable than rmsprop starting_point=(9,9) num_step=20 plist_sgd=opt(sgd_step,z,starting_point,sgd_learning_rate,num_step) plist_momentum=opt(momentum_step,z,starting_point,momentum_learning_rate,num_step) plist_nag=opt(nag_step,z,starting_point,nag_learning_rate,num_step) plist_ada=opt(adagrad_step,z,starting_point,ada_learning_rate,num_step) plist_rmsprop=opt(rmsprop_step,z,starting_point,rms_learning_rate,num_step) plist_adam=opt(adam_step,z,starting_point,adam_learning_rate,num_step) plist_newton=opt(newton_step,z,starting_point,0,num_step) # learning rate (0) is a dummy plot_contour(z) del_variables(['sgd','momentum','nag','ada','rmsprop','adam','newton']) sgd,=plt.plot(np.array(plist_sgd).T[0],np.array(plist_sgd).T[1],label='sgd') momentum,=plt.plot(np.array(plist_momentum).T[0],np.array(plist_momentum).T[1],label='momentum') nag,=plt.plot(np.array(plist_nag).T[0],np.array(plist_nag).T[1],label='nag') ada,=plt.plot(np.array(plist_ada).T[0],np.array(plist_ada).T[1],label='adagrad') rmsprop,=plt.plot(np.array(plist_rmsprop).T[0],np.array(plist_rmsprop).T[1],label='rmsprop') adam,=plt.plot(np.array(plist_adam).T[0],np.array(plist_adam).T[1],label='adam') newton,=plt.plot(np.array(plist_newton).T[0],np.array(plist_newton).T[1],label='newton') legend_list=form_legend_list(['sgd','momentum','nag','ada','rmsprop','adam','newton']) plt.legend(bbox_to_anchor=(0.4, 1),handles=legend_list) # In[ ]: