import sympy
def erase_buf(method):
    try:
        delattr(method,'buf')
    except AttributeError:
        pass
def load_buf(method):
    try:
        v=getattr(method,'buf')
    except AttributeError:
        v=0
    return v
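# The stateful optimizers below keep their running state (velocity, cache,
# moment estimates) in a 'buf' attribute on their own function object:
# erase_buf resets it between runs, and load_buf returns 0 on the first step.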
def opt(method,function,starting_point,learning_rate=0.12,steps=10):
    erase_buf(method)
    x,y=sympy.symbols('x,y')
    point=starting_point
    plist=[point]
    fx=function.diff(x)
    fy=function.diff(y)
    for i in range(steps):
        point=method(function,fx,fy,point,learning_rate)
        plist.append(point)
    return plist
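# opt drives one optimizer: it clears the method's buffer, takes `steps`
# update steps from starting_point, and returns the full trajectory
# [p0, p1, ..., p_steps] for plotting.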
import numpy as np
import math
# Stochastic Gradient Descent
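# update rule: x <- x - learning_rate * grad f(x)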
def sgd_step(f,fx,fy,point,learning_rate=0.1):
    x,y=sympy.symbols('x,y')
    dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1])))
    x=np.array(point) # reuse x as the parameter vector (shadows the sympy symbol) to match the Stanford note's notation
    x=x-learning_rate*dx
    return (x[0],x[1])
# SGD with Momentum
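# update rule: v <- mu*v - learning_rate * grad f(x);  x <- x + v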
def momentum_step(f,fx,fy,point,learning_rate=0.1,mu=0.5):
    # load the velocity from the previous step (0 on the first step)
    v=load_buf(momentum_step)
    x,y=sympy.symbols('x,y')
    dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1])))
    x=np.array(point) # reuse x as the parameter vector (shadows the sympy symbol)
    v=mu*v-learning_rate*dx
    x=x+v
    # save the velocity for the next step
    momentum_step.buf = v
    return (x[0],x[1])
# Nesterov Momentum
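# like momentum, but the gradient is evaluated at the look-ahead
# point x + mu*v instead of at x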
def nag_step(f,fx,fy,point,learning_rate=0.1,mu=0.5):
    v=np.zeros(2)+load_buf(nag_step) # previous velocity (zeros on the first step)
    # compute the gradient at the look-ahead point
    x,y=sympy.symbols('x,y')
    dx=np.array((fx.subs(x,point[0]+mu*v[0]).subs(y,point[1]+mu*v[1]),
                 fy.subs(x,point[0]+mu*v[0]).subs(y,point[1]+mu*v[1])))
    x=np.array(point) # reuse x as the parameter vector (shadows the sympy symbol)
    # update
    v=mu*v-learning_rate*dx
    x=x+v
    # save the velocity for the next step
    nag_step.buf = v
    return (x[0],x[1])
# AdaGrad
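# update rule: cache <- cache + |grad|^2;  x <- x - learning_rate * grad / (sqrt(cache) + eps)
# (this version keeps a single scalar cache for both coordinates, rather
# than the usual per-parameter cache)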
def adagrad_step(f,fx,fy,point,learning_rate=0.1):
    cache = load_buf(adagrad_step) # accumulated squared gradient (0 on the first step)
    x,y=sympy.symbols('x,y')
    dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1])))
    x=np.array(point) # reuse x as the parameter vector (shadows the sympy symbol)
    cache += dx[0]**2 + dx[1]**2
    x=x-learning_rate*dx/(math.sqrt(cache)+1e-7)
    adagrad_step.buf=cache
    return (x[0],x[1])
# RMSProp
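# like AdaGrad, but the squared-gradient cache decays exponentially:
# cache <- decay_rate*cache + (1-decay_rate)*|grad|^2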
def rmsprop_step(f,fx,fy,point,learning_rate=0.1,decay_rate=0.5):
    cache = load_buf(rmsprop_step) # decayed squared gradient (0 on the first step)
    x,y=sympy.symbols('x,y')
    dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1])))
    x=np.array(point) # reuse x as the parameter vector (shadows the sympy symbol)
    cache = decay_rate*cache + (1-decay_rate)*(dx[0]**2 + dx[1]**2)
    x=x-learning_rate*dx/(math.sqrt(cache)+1e-7)
    rmsprop_step.buf=cache
    return (x[0],x[1])
# Adam
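# update rule: m <- beta1*m + (1-beta1)*grad;  v <- beta2*v + (1-beta2)*|grad|^2,
# then bias-correct mb = m/(1-beta1^t), vb = v/(1-beta2^t) and step
# x <- x - learning_rate * mb / (sqrt(vb) + eps)
# (as above, v is a single scalar here rather than per-parameter)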
def adam_step(f,fx,fy,point,learning_rate=0.1,beta1=0.5,beta2=0.5):
    # unpack the buffer: step count t, first moment m (2-vector), second moment v (scalar)
    buf = np.zeros(4)+load_buf(adam_step)
    t=buf[0]
    m=buf[1:3]
    v=buf[3]
    x,y=sympy.symbols('x,y')
    dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1])))
    x=np.array(point) # reuse x as the parameter vector (shadows the sympy symbol)
    t=t+1
    m=beta1*m+(1-beta1)*dx
    v=beta2*v+(1-beta2)*(dx[0]**2 + dx[1]**2)
    mb=m/(1-beta1**t) # bias-corrected first moment
    vb=v/(1-beta2**t) # bias-corrected second moment
    x=x-learning_rate*mb/(math.sqrt(vb)+1e-7)
    adam_step.buf=np.concatenate([np.ones(1)*t,m,np.ones(1)*v])
    return (x[0],x[1])
# Newton
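# update rule: x <- x - H^-1 grad, where H is the 2x2 Hessian;
# no learning rate is needed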
def newton_step(f,fx,fy,point,learning_rate=0.1): # learning_rate is accepted for the opt interface but unused
    x,y=sympy.symbols('x,y')
    # assemble the 2x2 Hessian of f at the current point
    H=np.zeros((2,2))
    H[0,0]=fx.diff(x).subs(x,point[0]).subs(y,point[1])
    H[0,1]=fx.diff(y).subs(x,point[0]).subs(y,point[1])
    H[1,0]=fy.diff(x).subs(x,point[0]).subs(y,point[1])
    H[1,1]=fy.diff(y).subs(x,point[0]).subs(y,point[1])
    dx=np.array((fx.subs(x,point[0]).subs(y,point[1]),fy.subs(x,point[0]).subs(y,point[1])))
    x=np.array(point)-np.linalg.inv(H).dot(dx)
    return (x[0],x[1])
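# A minimal sanity check of newton_step (the underscore names are just local
# scratch variables): on a quadratic the Hessian is constant, so a single
# Newton step from any starting point lands exactly on the minimum (0,0).
_x,_y=sympy.symbols('x,y')
_z=_x**2 + 8*_y**2
_p=newton_step(_z,_z.diff(_x),_z.diff(_y),(9,9))
assert abs(_p[0])<1e-9 and abs(_p[1])<1e-9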
import matplotlib.pyplot as plt
%matplotlib inline
def del_variable(var):
    try:
        del globals()[var]
    except KeyError:
        pass
def del_variables(varlist):
    for var in varlist:
        del_variable(var)
def form_legend_list(varlist):
    # collect the plot handles that actually exist in globals()
    li=[]
    for var in varlist:
        if var in globals():
            li.append(globals()[var])
    return li
def plot_contour(z,rang=(-10,10,-10,10),delta=0.05):
    xr=np.arange(rang[0],rang[1],delta)
    yr=np.arange(rang[2],rang[3],delta)
    [X,Y]=np.meshgrid(xr,yr)
    # upper-case the sympy expression so 'x' and 'y' refer to the meshgrid
    # arrays X and Y, then evaluate it elementwise
    cmd=str(z).upper()
    Z=eval(cmd)
    plt.contour(X,Y,Z)
x,y=sympy.symbols('x,y')
z=x**2 + 8*y**2 # define function
sgd_learning_rate=0.12 #0.12
momentum_learning_rate=0.12 # more stable than sgd
nag_learning_rate=0.09 # less stable than sgd
ada_learning_rate=20 # very stable
rms_learning_rate=3 # tends to oscillate near the optimum
adam_learning_rate=4 # more stable than rmsprop
starting_point=(9,9)
num_step=20
plist_sgd=opt(sgd_step,z,starting_point,sgd_learning_rate,num_step)
plist_momentum=opt(momentum_step,z,starting_point,momentum_learning_rate,num_step)
plist_nag=opt(nag_step,z,starting_point,nag_learning_rate,num_step)
plist_ada=opt(adagrad_step,z,starting_point,ada_learning_rate,num_step)
plist_rmsprop=opt(rmsprop_step,z,starting_point,rms_learning_rate,num_step)
plist_adam=opt(adam_step,z,starting_point,adam_learning_rate,num_step)
plist_newton=opt(newton_step,z,starting_point,0,num_step) # learning rate (0) is a dummy
plot_contour(z)
del_variables(['sgd','momentum','nag','ada','rmsprop','adam','newton'])
sgd,=plt.plot(np.array(plist_sgd).T[0],np.array(plist_sgd).T[1],label='sgd')
momentum,=plt.plot(np.array(plist_momentum).T[0],np.array(plist_momentum).T[1],label='momentum')
nag,=plt.plot(np.array(plist_nag).T[0],np.array(plist_nag).T[1],label='nag')
ada,=plt.plot(np.array(plist_ada).T[0],np.array(plist_ada).T[1],label='adagrad')
rmsprop,=plt.plot(np.array(plist_rmsprop).T[0],np.array(plist_rmsprop).T[1],label='rmsprop')
adam,=plt.plot(np.array(plist_adam).T[0],np.array(plist_adam).T[1],label='adam')
newton,=plt.plot(np.array(plist_newton).T[0],np.array(plist_newton).T[1],label='newton')
legend_list=form_legend_list(['sgd','momentum','nag','ada','rmsprop','adam','newton'])
plt.legend(bbox_to_anchor=(0.4, 1),handles=legend_list)
x,y=sympy.symbols('x,y')
z=x**2 + 4*y**4 # define function
sgd_learning_rate=0.001 #0.12
momentum_learning_rate=0.001 # more stable than sgd
nag_learning_rate=0.001 # less stable than sgd
ada_learning_rate=10 # very stable (converges very slowly)
rms_learning_rate=10 # oscillates near the optimum
adam_learning_rate=5 # even more stable than rmsprop
starting_point=(9,9)
num_step=20
plist_sgd=opt(sgd_step,z,starting_point,sgd_learning_rate,num_step)
plist_momentum=opt(momentum_step,z,starting_point,momentum_learning_rate,num_step)
plist_nag=opt(nag_step,z,starting_point,nag_learning_rate,num_step)
plist_ada=opt(adagrad_step,z,starting_point,ada_learning_rate,num_step)
plist_rmsprop=opt(rmsprop_step,z,starting_point,rms_learning_rate,num_step)
plist_adam=opt(adam_step,z,starting_point,adam_learning_rate,num_step)
plist_newton=opt(newton_step,z,starting_point,0,num_step) # learning rate (0) is a dummy
plot_contour(z)
del_variables(['sgd','momentum','nag','ada','rmsprop','adam','newton'])
sgd,=plt.plot(np.array(plist_sgd).T[0],np.array(plist_sgd).T[1],label='sgd')
momentum,=plt.plot(np.array(plist_momentum).T[0],np.array(plist_momentum).T[1],label='momentum')
nag,=plt.plot(np.array(plist_nag).T[0],np.array(plist_nag).T[1],label='nag')
ada,=plt.plot(np.array(plist_ada).T[0],np.array(plist_ada).T[1],label='adagrad')
rmsprop,=plt.plot(np.array(plist_rmsprop).T[0],np.array(plist_rmsprop).T[1],label='rmsprop')
adam,=plt.plot(np.array(plist_adam).T[0],np.array(plist_adam).T[1],label='adam')
newton,=plt.plot(np.array(plist_newton).T[0],np.array(plist_newton).T[1],label='newton')
legend_list=form_legend_list(['sgd','momentum','nag','ada','rmsprop','adam','newton'])
plt.legend(bbox_to_anchor=(0.4, 1),handles=legend_list)