(a) Plot the Rosenbrock function [Rosenbrock, 1960]
$f = (1 - x)^2 + 100(y - x^2)^2$
# From Neil's nm.py code
import copy as cp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
def f(x):
global evals
evals += 1
return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2
plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.set_proj_type('ortho')
surf = ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)
plt.title("Rosenbrock Function")
plt.show()
(b) Pick a stopping criterion and use the downhill simplex method to search for its minimum starting from $x = y = -1$, plotting the search path, and counting the number of algorithm iterations and function evaluations used.
evals = 0
def nelder_mead(f, start, start_size, tolerance):
global evals
evals = 0
d = start.size
x = []
x.append({'pt':cp.deepcopy(start), 'fn':f(start)})
for i in range(d):
x.append({'pt':cp.deepcopy(start), 'fn':0.0})
x[-1]['pt'][i] += start_size
x[-1]['fn'] = f(x[-1]['pt'])
x_s = []
x.sort(key=lambda item:item['fn'])
x_s.append(cp.deepcopy(x))
while True:
mid = np.zeros(d)
for i in range(d):
mid += x[i]['pt']
mid = mid/d
y = mid + (mid - x[-1]['pt'])
f_y = f(y)
if (f_y < x[0]['fn']):
z = mid + 2.0*(mid - x[-1]['pt'])
f_z = f(z)
if (f_z < f_y):
x[-1] = {'pt':z, 'fn':f_z}
else:
x[-1] = {'pt':y, 'fn':f_y}
elif (f_y < x[-2]['fn']):
x[-1] = {'pt':y, 'fn':f_y}
else:
y = mid + 0.5*(mid - x[-1]['pt'])
f_y = f(y)
if (f_y < x[-2]['fn']):
x[-1] = {'pt':y, 'fn':f_y}
else:
y = mid - 0.5*(mid - x[-1]['pt'])
f_y = f(y)
if (f_y < x[-2]['fn']):
x[-1] = {'pt':y, 'fn':f_y}
else:
for i in range(1, d+1):
x[i]['pt'] = (x[0]['pt'] + x[i]['pt'])/2.0
x[i]['fn'] = f(x[i]['pt'])
x.sort(key=lambda item:item['fn'])
x_s.append(cp.deepcopy(x))
error = abs(x_s[-1][0]['fn'] - x_s[-2][0]['fn'])
if ((error > 0) & (error < tolerance)):
break
return x_s
D = 2
start_size = 0.1
tolerance = 1e-6
start = -1*np.ones(D)
x_s = nelder_mead(f, start, start_size, tolerance)
plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min,xy_max,(xy_max-xy_min)/100)
y = np.arange(xy_min,xy_max,(xy_max-xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
plt.ion()
point = ''
for i in range(len(start)):
point += '%.3f '%x_s[-1][0]['pt'][i]
title = 'Rosenbrock Nelder-Mead search'\
+'\nfinal point: ' + point\
+'\ntolerance: ' + str(tolerance)\
+'\niterations: ' + str(len(x_s))\
+' function evaluations: ' + str(evals)
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.set_proj_type('ortho')
surf = ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)
for i in range(len(x_s)):
x = []
y = []
z = []
x.append(x_s[i][0]['pt'][0])
y.append(x_s[i][0]['pt'][1])
z.append(x_s[i][0]['fn'])
x.append(x_s[i][1]['pt'][0])
y.append(x_s[i][1]['pt'][1])
z.append(x_s[i][1]['fn'])
x.append(x_s[i][2]['pt'][0])
y.append(x_s[i][2]['pt'][1])
z.append(x_s[i][2]['fn'])
x.append(x_s[i][0]['pt'][0])
y.append(x_s[i][0]['pt'][1])
z.append(x_s[i][0]['fn'])
ax.plot(x, y, z, color='black', marker='.')
plt.title(title)
plt.show()
(c) Now repeat the simplex search for $D = 10$ in the D-dimensional generalizations of the Rosenbrock function:
$f = \sum_{i=1}^{D-1} (1 - x_{i-1})^2 + 100(x_i - x_{i-1}^2)^2$
Start all the $x_i$ at -1, plot a 2D projection of the search path, and again count algorithm iterations and function evaluations.
plot_size = 100
xy_min = -1.5
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
def f(x):
global evals, D
evals += 1
sum = 0
for i in range(1, D):
sum += (1 - x[i-1])**2 + 100*(x[i] - x[i-1]**2)**2
return sum
D = 10
start_size = 0.1
tolerance = 1e-6
start = -1*np.ones(D)
x_s = nelder_mead(f, start, start_size, tolerance)
point = ''
for i in range(len(start)):
point += '%.3f '%x_s[-1][0]['pt'][i]
title = '10D Rosenbrock Nelder-Mead search'\
+ '\nfinal point: ' + point\
+ '\ntolerance: ' + str(tolerance)\
+ '\niterations: ' + str(len(x_s))\
+ ' function evaluations: ' + str(evals)
fig, ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=0.5)
x = [[],[],[],[]]
y = [[],[],[],[]]
for i in range(len(x_s)):
x[0].append(x_s[i][0]['pt'][0])
y[0].append(x_s[i][0]['pt'][1])
x[1].append(x_s[i][1]['pt'][0])
y[1].append(x_s[i][1]['pt'][1])
x[2].append(x_s[i][2]['pt'][0])
y[2].append(x_s[i][2]['pt'][1])
x[3].append(x_s[i][0]['pt'][0])
y[3].append(x_s[i][0]['pt'][1])
ax.plot(x, y, color='white', marker='.')
plt.title(title)
plt.show()
(d) Repeat and compare the 2D and 10D Rosenbrock search with the gradient descent method.
# From Neil's gd.py code
import copy as cp
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
evals = 0
def gradient_descent(f, df, start, start_scale, rate, momentum, tolerance):
scale = start_scale
x = np.copy(start)
last_x = np.copy(start)
last_last_x = np.copy(start)
pts = [{'pt':np.copy(x), 'fn':f(x)}]
while True:
last_last_x = np.copy(last_x)
last_x = np.copy(x)
x_new = x - scale*df(x) + momentum*(last_x - last_last_x)
f_n = f(x_new)
if (f_n <= pts[-1]['fn']):
x = np.copy(x_new)
pts.append({'pt':np.copy(x), 'fn':f_n})
scale *= rate
error = abs(pts[-1]['fn'] - pts[-2]['fn'])
if (error < tolerance):
break
else:
scale *= 1/rate
return pts
def f(x):
global evals
evals += 1
return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2
def df(x):
global evals
evals += 2
return np.array([-2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0], 200*(x[1] - x[0]**2)])
D = 2
start = -1*np.ones(D)
start_scale = 1e-5
rate = 1.5
momentum = 0.95
tolerance = 1e-6
pts = gradient_descent(f, df, start, start_scale, rate, momentum, tolerance)
plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
x = []
y = []
z = []
for i in range(len(pts)):
x.append(pts[i]['pt'][0])
y.append(pts[i]['pt'][1])
z.append(pts[i]['fn'])
point = ''
for i in range(len(start)):
point += '%.3f '%pts[-1]['pt'][i]
title = '2D Rosenbrock gradient descent search (with momentum)'\
+ '\nfinal point: ' + point\
+ '\ntolerance: ' + str(tolerance)\
+ '\niterations: ' + str(len(pts))\
+ ' function evaultions ' + str(evals)
plt.ion()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.set_proj_type('ortho')
ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)
ax.plot(x, y, z, color='black')
plt.title(title)
plt.show()
plot_size = 100
xy_min = -1.5
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
def f(x):
global evals, D
evals += 1
sum = 0
for i in range(1, D):
sum += (1 - x[i-1])**2 + 100*(x[i] - x[i-1]**2)**2
return sum
def df(x):
global evals, D
evals += D
dim = np.zeros(D)
dim[0] = -2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0]
dim[D-1] = 200*(x[D-1] - x[D-2]**2)
for i in range(1, D-1):
dim[i] = -2*(1 - x[i]) - 400*(x[i+1] - x[i]**2)*x[i] + 200*(x[i] - x[i-1]**2)
return dim
D = 10
start = -1*np.zeros(D)
start_scale = 1e-5
rate = 1.5
tolerance = 1e-6
momentum = 0.95
pts = gradient_descent(f, df, start, start_scale, rate, momentum, tolerance)
point = ''
for i in range(len(start)):
point += '%.3f '%pts[-1]['pt'][i]
title = '10D Rosenbrock gradient descent search (with momentum)'\
+ '\nfinal point: ' + point\
+ '\ntolerance: ' + str(tolerance)\
+ '\niterations: ' + str(len(pts))\
+ ' function evaluations: ' + str(evals)
x = []
y = []
z = []
for i in range(len(pts)):
x.append(pts[i]['pt'][0])
y.append(pts[i]['pt'][1])
z.append(pts[i]['fn'])
fig, ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=0.5)
ax.plot(x, y, color='white')
plt.title(title)
plt.show()
(e) Repeat and compare the 2D and 10D Rosenbrock search with the conjugate gradient method.
# From Neil's cg.py code
import copy as cp
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
evals = 0
def conjugate_gradient(f, df, start, start_step, tolerance):
golden = (1 + math.sqrt(5))/2
pts = []
t0 = {}
t1 = {'pt':np.copy(start), 'fn':f(start)}
t2 = {}
t3 = {}
pts.append(cp.deepcopy(t1))
dir = df(start)
grad = df(start)
step = 0
step_size = start_step
while True:
step += 1
dir_old = np.copy(dir)
grad_old = np.copy(grad)
grad = df(t1['pt'])
gamma = np.dot(grad, grad - grad_old)/np.dot(grad_old, grad_old)
dir = grad + gamma*dir_old
if (step % (dim + 1) == 0):
dir = np.copy(grad)
norm_dir = -dir/np.sqrt(np.dot(dir, dir))
t0 = cp.deepcopy(t1)
while True:
t1['pt'] = t0['pt'] + step_size*norm_dir
t1['fn'] = f(t1['pt'])
if (t1['fn'] > t0['fn']):
step_size *= 0.5
continue
t2['pt'] = t0['pt'] + (t1['pt'] - t0['pt'])*golden
t2['fn'] = f(t2['pt'])
break
while True:
if (t1['fn'] < t2['fn']):
break
t0 = cp.deepcopy(t1)
t1 = cp.deepcopy(t2)
t2['pt'] = t1['pt'] + golden*(t1['pt'] - t0['pt'])
t2['fn'] = f(t2['pt'])
while True:
t3['pt'] = t1['pt'] + (t1['pt'] - t0['pt'])/golden
t3['fn'] = f(t3['pt'])
if(t3['fn'] > t1['fn']):
t2 = cp.deepcopy(t0)
t0 = cp.deepcopy(t3)
else:
t0 = cp.deepcopy(t1)
t1 = cp.deepcopy(t3)
error = abs(t1['fn'] - t0['fn']) + abs(t2['fn'] - t1['fn'])
if (error < tolerance):
break
pts.append(cp.deepcopy(t1))
error = abs(pts[-1]['fn'] - pts[-2]['fn'])
if (error < tolerance):
break
return pts
def f(x):
global evals
evals += 1
return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2
def df(x):
global evals
evals += 2
return np.array([-2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0], 200*(x[1] - x[0]**2)])
dim = 2
start = -1*np.ones(dim)
start_step = 0.1
tolerance = 1e-6
pts = conjugate_gradient(f, df, start, start_step, tolerance)
plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
x = []
y = []
z = []
for i in range(len(pts)):
x.append(pts[i]['pt'][0])
y.append(pts[i]['pt'][1])
z.append(pts[i]['fn'])
point = ''
for i in range(len(start)):
point += '%.3f '%pts[-1]['pt'][i]
title = '2D Rosenbrock conjugate gradient search'\
+ '\nfinal point: ' + point\
+ '\ntolerance: ' + str(tolerance)\
+ '\niterations: ' + str(len(pts))\
+ ' function evaluations: ' + str(evals)
plt.ion()
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.set_proj_type('ortho')
ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)
ax.plot(x, y, z, color='black')
plt.title(title)
plt.show()
plot_size = 100
xy_min = -1.5
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
def f(x):
global evals, dim
evals += 1
sum = 0
for i in range(1, dim):
sum += (1 - x[i-1])**2 + 100*(x[i] - x[i-1]**2)**2
return sum
def df(x):
global evals, dim
evals += dim
d = np.zeros(dim)
d[0] = -2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0]
d[dim-1] = 200*(x[dim-1] - x[dim-2]**2)
for i in range(1, dim-1):
d[i] = -2*(1 - x[i]) - 400*(x[i+1] - x[i]**2)*x[i] + 200*(x[i] - x[i-1]**2)
return d
dim = 10
start = -1*np.ones(dim)
start_step = 0.1
tolerance = 1e-6
pts = conjugate_gradient(f, df, start, start_step, tolerance)
point = ''
for i in range(len(start)):
point += '%.3f '%pts[-1]['pt'][i]
title = '10D Rosenbrock conjugate gradient search'\
+ '\nfinal point: ' + point\
+ '\ntolerance: ' + str(tolerance)\
+ '\niterations: ' + str(len(pts))\
+ ' function evaluations: ' + str(evals)
x = []
y = []
for i in range(len(pts)):
x.append(pts[i]['pt'][0])
y.append(pts[i]['pt'][1])
fig, ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=0.5)
ax.plot(x, y, color='white')
plt.title(title)
plt.show()
(f) Repeat and compare the 2D and 10D Rosenbrock search with the CMA-ES method.
# From Neil's cma.py code
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib import cm
import copy as cp
evals = 0
def nm(f, mean, var, filter, momentum, tolerance, pop_size):
global evals
evals = 0
d = mean.size
last_mean = np.copy(mean)
last_last_mean = np.copy(last_mean)
cov = np.diag(var)
pop = []
f_max = -float('inf')
for i in range(pop_size):
x = np.random.multivariate_normal(mean,cov)
f_n = f(x)
pop.append({'pt':x,'fn':f_n})
pop.sort(key=lambda item:item['fn'])
pops = []
pops.append(pop)
weights = np.zeros(pop_size)
for i in range(pop_size // 2):
weights[i] = (pop_size // 2) - i
weights /= np.sum(weights)
while True:
new_mean = np.zeros(d)
for i in range(pop_size):
new_mean += pop[i]['pt']*weights[i]
new_cov = np.zeros((d,d))
for i in range(pop_size):
new_cov += np.outer(pop[i]['pt'] - mean, pop[i]['pt'] - mean)*weights[i]
last_last_mean = np.copy(last_mean)
last_mean = np.copy(mean)
mean = (1 - filter)*mean + filter*new_mean + momentum*(last_mean - last_last_mean)
cov = (1 - filter)*cov + filter*new_cov
pop = []
for i in range(pop_size):
x = np.random.multivariate_normal(mean,cov)
f_n = f(x)
pop.append({'pt':x,'fn':f_n})
pop.sort(key=lambda item:item['fn'])
pops.append(pop)
error = abs(pops[-1][0]['fn'] - pops[-2][0]['fn'])
if (error < tolerance):
break
return(pops)
def f(x):
global evals
evals += 1
return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2
dim = 2
mean = -1*np.ones(dim)
std = 0.25
var = std**2*np.ones(dim)
filter = 0.1
momentum = 0.9
tolerance = 1e-6
pop_size = 3
pops = nm(f, mean, var, filter, momentum, tolerance, pop_size)
plot_size = 100
xymin = -1.1
xymax = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
x_s = []
y_s = []
for i in range(len(pops)):
for j in range(len(pops[i])):
x_s.append(pops[i][j]['pt'][0])
y_s.append(pops[i][j]['pt'][1])
point = ''
for i in range(len(mean)):
point += '%.3f '%pops[-1][0]['pt'][i]
title = '2D Rosenbrock CMA-ES search (with momentum)'\
+'\nfinal point: '+point\
+'\ntolerance: '+str(tolerance)\
+'\niterations: '+str(len(pops))\
+' function evaluations: '+str(evals)
plt.ion()
fig,ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=.5)
colors = np.arange(len(xs))/len(xs)
ax.scatter(xs, ys, 2, colors)
plt.xlim(xy_min, xy_max)
plt.ylim(xy_min, xy_max)
plt.title(title)
plt.show()
dim = 2
mean = -1*np.ones(dim)
std = 0.25
var = std**2*np.ones(dim)
filter = 0.1
momentum = 0
tolerance = 1e-6
pop_size = 3
pops = nm(f, mean, var, filter, momentum, tolerance, pop_size)
x_s = []
y_s = []
for i in range(len(pops)):
for j in range(len(pops[i])):
x_s.append(pops[i][j]['pt'][0])
y_s.append(pops[i][j]['pt'][1])
point = ''
for i in range(len(mean)):
point += '%.3f '%pops[-1][0]['pt'][i]
title = '2D Rosenbrock CMA-ES search (no momentum)'\
+'\nfinal point: '+point\
+'\ntolerance : '+str(tolerance)\
+'\niterations: '+str(len(pops))\
+' function evaluations: '+str(evals)
plt.ion()
fig,ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=.5)
colors = np.arange(len(xs))/len(xs)
ax.scatter(xs, ys, 2, colors)
plt.xlim(xy_min,xy_max)
plt.ylim(xy_min,xy_max)
plt.title(title)
plt.show()
plotsize = 100
xymin = -1.5
xymax = 1.3
x = np.arange(xy_min,xy_max,(xy_max-xy_min)/100)
y = np.arange(xy_min,xy_max,(xy_max-xy_min)/100)
(X,Y) = np.meshgrid(x,y)
Z = f([X,Y])
def f(x):
global evals, dim
evals += 1
sum = 0
for i in range(1, dim):
sum += (1 - x[i-1])**2 + 100*(x[i] - x[i-1]**2)**2
return sum
dim = 10
mean = -1*np.ones(dim)
std = 0.25
var = std**2*np.ones(dim)
filter = 0.1
momentum = 0.9
tolerance = 1e-6
pop_size = 25
pops = nm(f, mean, var, filter, momentum, tolerance, pop_size)
x_s = []
y_s = []
for i in range(len(pops)):
for j in range(len(pops[i])):
x_s.append(pops[i][j]['pt'][0])
y_s.append(pops[i][j]['pt'][1])
point = ''
for i in range(len(mean)):
point += '%.3f '%pops[-1][0]['pt'][i]
title = '10D Rosenbrock CMA-ES search (with momentum)'\
+'\nfinal point: '+point\
+'\ntolerance : '+str(tolerance)\
+'\niterations: '+str(len(pops))\
+' function evaluations: '+str(evals)
plt.ion()
fig,ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=.5)
colors = np.arange(len(xs))/len(xs)
ax.scatter(xs, ys, 2, colors)
plt.xlim(xymin,xymax)
plt.ylim(xymin,xymax)
plt.title(title)
plt.show()
(g) Explore how these algorithms depend on parameters such as momentum, temperature, and dimension.
First, we'll look at the effects of changing dimensions (note that momentum was set to 0 for relevant methods, and every method had the same parameters):
Method | Dimension | Iterations | Function Evaluations |
---|---|---|---|
Nelder-Mead | 5 | 445 | 722 |
Nelder-Mead | 10 | 3519 | 4838 |
Nelder-Mead | 15 | 2952 | 3863 |
Gradient Descent | 5 | 1250 | 15529 |
Gradient Descent | 10 | 1521 | 33922 |
Gradient Descent | 15 | 2226 | 71617 |
Conjugate Gradient | 5 | 76 | 1693 |
Conjugate Gradient | 10 | 102 | 2839 |
Conjugate Gradient | 15 | 143 | 4647 |
CMA-ES | 5 | 699 | 17475 |
CMA-ES | 10 | 1405 | 35125 |
CMA-ES | 15 | 1623 | 40575 |
As you can see, it depends on the dimension, as well as what is more important, iterations or function evaluations, to determine which method is the "best" for a given task. Almost all of the methods take more time for higher dimensions, with the exception of Nelder-Mead. In the Nelder-Mead method, there's a certain point where iterations and function evaluations go down instead of up with higher dimensions. Gradient descent's performance time grows exponentially with dimension, which CMA-ES performance time still grow, but at a decaying rate. The conjugate gradient method seems to be the most linear of the four methods in terms of performance time growing as dimensions are added.
We will now look at the performance of the gradient descent and CMA-ES methods when factoring in momentum. All parameters were equal, and the two methods were tested at 10 dimensions:
Method | Momentum | Iterations | Function Evaluations |
---|---|---|---|
Gradient Descent | 0.1 | 1243 | 27806 |
Gradient Descent | 0.5 | 1394 | 31128 |
Gradient Descent | 0.9 | 782 | 17664 |
CMA-ES | 0.1 | 1626 | 40650 |
CMA-ES | 0.5 | 1928 | 48200 |
CMA-ES | 0.9 | 550 | 13750 |
It seems that momentum isn't always helpful for the performance of a given method. For the gradient descent method, any momentum did decrease both the iterations and function evaluations, but not for CMA-ES. For CMA-ES, momentum only decreased the iterations and function evaluations when it reached higher values. With this being said, it seemed like a little momentum helped more than half-momentum, but both were much worse than almost full momentum (value of 0.9).