import numpy as np
from matplotlib import rc
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from mpl_toolkits import mplot3d
import matplotlib.cm as cm
rc('animation', html='html5')
sns.set(style='whitegrid', rc={'figure.figsize':(10,8)})
def rosenbrock_2d(x, y):
return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2
def plot_rosenbrock(lim):
x = np.linspace(-lim, lim, 50)
y = np.linspace(-lim, lim, 50)
X, Y = np.meshgrid(x, y)
Z = rosenbrock_2d(X, Y)
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1,
cmap='viridis', edgecolor='none')
ax.set_title('Rosenbrock over [-%.1f, %.1f]x[-%.1f, %.1f]' % (lim, lim, lim, lim), y=1.05)
ax.view_init(35, 35)
return fig, ax
plot_rosenbrock(1)
plot_rosenbrock(5)
plot_rosenbrock(0.2)
First, let's define a general rosenbrock:
def rosenbrock(x):
s = 0
for i in range(1, len(x)):
s += (1 - x[i - 1]) ** 2 + 100 * (x[i] - x[i - 1] ** 2) ** 2
return s
Now a function that gets a point and returns a simplex. My simplex structure is a list of tuples, the first element of the tuples is the function evaluated on that point and the second element is the point itself (this will save us some function evaluations)
# given a D dimension point x, returns a simplex containing x with unit vector edges
def init_simplex(x_init, f):
xs = [x_init]
for i in range(len(x_init)):
x_new = x_init.copy()
x_new[i] += 1
xs.append(x_new)
# sort points according to f
simplex = [(f(x), x) for x in xs]
simplex.sort(key=lambda x: x[0], reverse=True)
return simplex
# inserts a value and point x into the simplex in the correct position and removes the first element
def insert_to_simplex(simplex, x, val):
i = 0
while i < len(simplex):
if simplex[i][0] < val:
break
i += 1
if i > 1:
return simplex[1:i] + [(val, x)] + simplex[i:]
else:
return [(val, x)] + simplex[1:]
def simplex_step(simplex, f):
eval_count = 0
x_mean = np.mean([x for _, x in simplex[1:]], axis=0)
# reflect
x_reflect = 2 * x_mean - simplex[0][1]
val_reflect = f(x_reflect)
eval_count += 1
if val_reflect < simplex[-1][0]:
# reflect is a good move, try grow
x_grow = 3 * x_mean - 2 * simplex[0][1]
val_grow = f(x_grow)
eval_count += 1
if val_grow < val_reflect:
# grow is even better
simplex.append((val_grow, x_grow))
else:
simplex.append((val_reflect, x_reflect))
# remove the worst point
simplex.pop(0)
elif val_reflect > simplex[0][0]:
# we overshot the minimium, try reflect-shrink
x_reflect_shrink = (3 * x_mean - simplex[0][1]) / 2
val_reflect_shrink = f(x_reflect_shrink)
eval_count += 1
if val_reflect_shrink < simplex[0][0]:
# good, accept it
simplex = insert_to_simplex(simplex, x_reflect_shrink, val_reflect_shrink)
else:
# not good, try just shrink
x_shrink = (x_mean + simplex[0][1]) / 2
val_shrink = f(x_shrink)
eval_count += 1
if val_shrink < simplex[0][0]:
# good, accept it
simplex = insert_to_simplex(simplex, x_shrink, val_shrink)
else:
# damn, shrink everything
for i in range(len(simplex) - 1):
x_new = (simplex[i][1] + simplex[-1][1]) / 2
val_new = f(x_new)
eval_count += 1
simplex[i] = (val_new, x_new)
else:
# reflect is good (but not best), keep it
simplex = insert_to_simplex(simplex, x_reflect, val_reflect)
return simplex, eval_count
def plot_simplex_3d(simplex, ax):
x = [p[0] for (_, p) in simplex]
y = [p[1] for (_, p) in simplex]
z = [z for (z, _) in simplex]
#add the first point
x.append(simplex[0][1][0])
y.append(simplex[0][1][1])
z.append(simplex[0][0])
ax.plot3D(x, y, z, 'red')
steps_total = 0
evals_total = 0
x_init = np.array([-1, -1])
# init simplex
simplex = init_simplex(x_init, rosenbrock)
diff = simplex[0][0] - simplex[-1][0]
# collect all simplexes to animate later
all_simplexes = [simplex.copy()]
# stopping condition, when the diff between the largest and smallest point of th simplex is smaller than
while diff > 1e-4:
simplex, evals = simplex_step(simplex, rosenbrock)
steps_total += 1
evals_total += evals
diff = simplex[0][0] - simplex[-1][0]
all_simplexes.append(simplex.copy())
print("Best point:", simplex[-1][1])
print("Value:", simplex[-1][0])
print("Total Steps:", steps_total)
print("Total Evaluations:", evals_total)
fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Downhill Simplex", y=1.1)
def update_anim(i):
plot_simplex_3d(all_simplexes[i], ax)
animation.FuncAnimation(fig, update_anim, range(steps_total), blit=False, interval=200, repeat=True)
D = 10
steps_total = 0
evals_total = 0
x_init = -np.ones(D)
# init simplex
simplex = init_simplex(x_init, rosenbrock)
diff = simplex[0][0] - simplex[-1][0]
# collect all simplexes to animate later
all_simplexes = [simplex.copy()]
# stopping condition, when the diff between the largest and smallest point of th simplex is smaller than
while diff > 1e-4:
simplex, evals = simplex_step(simplex, rosenbrock)
steps_total += 1
evals_total += evals
diff = simplex[0][0] - simplex[-1][0]
all_simplexes.append(simplex.copy())
print("Best point:", simplex[-1][1])
print("Value:", simplex[-1][0])
print("Total Steps:", steps_total)
print("Total Evaluations:", evals_total)
# takes too long, not worth it
# frames = 1000
# cmap = sns.color_palette("coolwarm", frames + 1)
# seq_colors = [cmap[i] for i in range(frames + 1)]
# fig = plt.figure(figsize=(12, 8))
# axes = fig.subplots(nrows=2, ncols=3)
# for i in range(2):
# for j in range(3):
# axes[i][j].set_xlabel("X[%d]" % (i * 6 + j * 2))
# axes[i][j].set_ylabel("X[%d]" % (i * 6 + j * 2 + 1))
# axes[i][j].set_xlim(-1, 1)
# axes[i][j].set_ylim(-1, 1)
# axes[1][2].remove()
# fig.tight_layout()
# def update_anim(k):
# for i in range(2):
# for j in range(3):
# idx = i * 6 + j * 2
# if idx > 9:
# continue
# axes[i][j].plot(all_simplexes[k][-1][1][idx],
# all_simplexes[k][-1][1][idx+1],
# 'o',
# c=seq_colors[k])
# animation.FuncAnimation(fig, update_anim, range(frames), blit=False, interval=30, repeat=True)
# given a function, start point and direction - find the minima on the line
def line_minimize(f, x0, d):
a = 1
x1 = x0 + a * d
b = 2 * a / (1 + np.sqrt(5))
x2 = x1 + b * d
eval_count = 3
fx0 = f(x0)
fx1 = f(x1)
fx2 = f(x2)
# if x1 is the max of the triplet, choose a direction by the min(x0, x2)
if fx1 > fx0 and fx1 > fx2:
if fx0 > fx2:
x0, x1 = x1, x2
fx0, fx1 = fx1, fx2
a = b
else:
x0, x1 = x1, x0
fx0, fx1 = fx1, fx0
a = -a
b = 2 * a / (1 + np.sqrt(5))
x2 = x1 + b * d
fx2 = f(x2)
eval_count += 1
# now, if he triplet are all descending, move along the line until x1 is *between* x0 and x2
while not (fx1 < fx0 and fx1 < fx2):
if fx0 > fx2:
x1 = x2
fx1 = fx2
a = a + b
else:
x0, x1 = x2, x0
fx0, fx1 = fx2, fx0
a = -(a + b)
b = 2 * a / (1 + np.sqrt(5))
x2 = x1 + b * d
fx2 = f(x2)
eval_count += 1
# we now have x1 between x0 and x2
# now drill down until the bracket values are almost the same
while abs(fx0 - fx2) > 1e-3:
c = 2 * b / (1 + np.sqrt(5))
x3 = x1 - c * d
fx3 = f(x3)
eval_count += 1
if fx3 < fx1:
x1, x2 = x3, x1
fx1, fx2 = fx3, fx1
a = a - c
b = c
else:
x0, x2 = x2, x3
fx0, fx2 = fx2, fx3
a = -b
b = -c
return x1, fx1, eval_count
# sanity check
x0 = np.array([1, -1])
d0 = np.array([0, 1])
line_minimize(rosenbrock, x0, d0)
Let's calculate the gradient of the Rosebrock function:
$$ \frac{\partial f}{\partial x_0} = -2(1 - x_0) - 400 x_0 (x_1 - x_0^2) $$
$$ \frac{\partial f}{\partial x_{D - 1}} = 200(x_{D - 1} - x_{D - 2}^2) $$
$$ \forall 0 < i < D - 1 : \frac{\partial f}{\partial x_i} = -2(1 - x_i) - 400 x_i (x_{i + 1} - x_i^2) + 200(x_i - x_{i - 1}^2) $$
def rosenbrock_grad(x):
g = np.zeros(len(x))
g[0] = -2 * (1 - x[0]) - 400 * x[0] * (x[1] - x[0] ** 2)
g[-1] = 200 * (x[-1] - x[-2] ** 2)
for i in range(1, len(x) - 1):
g[i] = -2 * (1 - x[i]) - 400 * x[i] * (x[i + 1] - x[i] ** 2) + 200 * (x[i] - x[i - 1] ** 2)
return g
def polak_ribiere(f, x, d):
eval_count = 1
lines_count = 0
fx = f(x)
g = rosenbrock_grad(x)
diff = 1000 # just some big number
# save for plotting
all_lines = [(x, fx)]
while diff > 1e-4:
x_new, fx_new, evals = line_minimize(f, x, d)
all_lines.append((x_new, fx_new))
eval_count += evals
g_new = rosenbrock_grad(x_new)
gamma = np.dot(g_new, g_new - g) / np.dot(g, g)
d = g_new + gamma * d
g = g_new
x = x_new
diff = abs(fx_new - fx)
fx = fx_new
return x, fx, eval_count, all_lines
x0 = np.array([-1, -1])
d0 = np.array([1, 0.1])
x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)
print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)
fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Conjugate Gradient", y=1.1)
def update_anim(i):
if i > 0:
x = [lines[i-1][0][0], lines[i][0][0]]
y = [lines[i-1][0][1], lines[i][0][1]]
z = [lines[i-1][1], lines[i][1]]
ax.plot3D(x, y, z, 'red')
animation.FuncAnimation(fig, update_anim, range(len(lines)), blit=False, interval=300, repeat=True)
x0 = np.array([-1, -1])
d0 = np.array([0.1, 1])
x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)
print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)
fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Conjugate Gradient", y=1.1)
def update_anim(i):
if i > 0:
x = [lines[i-1][0][0], lines[i][0][0]]
y = [lines[i-1][0][1], lines[i][0][1]]
z = [lines[i-1][1], lines[i][1]]
ax.plot3D(x, y, z, 'red')
animation.FuncAnimation(fig, update_anim, range(len(lines)), blit=False, interval=300, repeat=True)
So according to Wikipedia, a smart idea is to start with the direction of the steepest descent, then every $D $iterations to reset direction $d$ to the steepest gradient at that point. Apparently the direction loses conjugacy after D iterations (or less)
If after a reset we have no progress, we're done.
def polak_ribiere(f, x, d):
eval_count = 1
lines_count = 0
fx = f(x)
g = rosenbrock_grad(x)
# save for plotting
all_lines = [(x, fx)]
i = 0
while True:
x_new, fx_new, evals = line_minimize(f, x, d)
if i == 0 and abs(fx - fx_new) < 1e-4:
return x, fx, eval_count, all_lines
all_lines.append((x_new, fx_new))
eval_count += evals
g_new = rosenbrock_grad(x_new)
i += 1
if i < max(10, len(x)): # if D is small (2), we reset too often and this algo becomes steepest descent
gamma = np.dot(g_new, g_new - g) / np.dot(g, g)
d = g_new + gamma * d
g = g_new
x = x_new
fx = fx_new
else:
# RESET
d = -g_new
g = g_new
x = x_new
fx = fx_new
i = 0
x0 = np.array([-1, -1])
d0 = np.array([0.1, 1])
x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)
print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)
fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Conjugate Gradient", y=1.1)
def update_anim(i):
if i > 0:
x = [lines[i-1][0][0], lines[i][0][0]]
y = [lines[i-1][0][1], lines[i][0][1]]
z = [lines[i-1][1], lines[i][1]]
ax.plot3D(x, y, z, 'red')
animation.FuncAnimation(fig, update_anim, range(len(lines)), blit=False, interval=300, repeat=True)
D = 10
x0 = -np.ones(D)
d0 = -rosenbrock_grad(x0) # steepest descent
x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)
print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)
A good clear overview of the algorithm - The CMA Evolution Strategy: A Tutorial (Hansen 2016)
def cma_es(f, x0, sigma, population_size, exp_filter=0.5, iterations=100):
eval_count = 0
N = len(x0)
mean = x0
cov = np.identity(N)
# for plotting
all_dists = [(mean, cov)]
for j in range(iterations):
# sample population from a multivariate distribution
population = []
for i in range(population_size):
x = np.random.multivariate_normal(mean, cov * sigma)
fx = f(x)
eval_count += 1
population.append((fx, x))
# now we need to use the fx's (fitness) to generate weights for each sample in the population
# first, let's sort the population according to fitness
population.sort(key = lambda x : x[0])
# according to the tutorial it's a good idea to consider half of the population
# so half of the weights will be zero
# now I played around with two weight scheme, one where they all are equal:
# A) equal weights
weights = np.zeros(population_size)
for i in range(population_size // 2):
weights[i] = 1
weights /= np.sum(weights)
# Update: This works fine for 2D, but doesn't converge for 10D!
# B) descending weights:
weights = np.zeros(population_size)
for i in range(population_size // 2):
weights[i] = (population_size // 2) - i
weights /= np.sum(weights)
# Calculate the new weighted mean
mean_new = np.zeros(N)
for i in range(population_size // 2):
mean_new += weights[i] * population[i][1]
# now we need to update the covariance matrix
# there seems to be a lot of ways to do this
# i'm going to try and keep it simple and apply equaton (12) from page 10 in the tutorial referenced above
cov_new = np.zeros((N, N))
for i in range(population_size // 2):
v = population[i][1] - mean
cov_new = np.add(cov_new, weights[i] * np.outer(v, v))
# update params using exponential filter
mean = exp_filter * mean + (1 - exp_filter) * mean_new
cov = exp_filter * cov + (1 - exp_filter) * cov_new
# when to stop??
all_dists.append((mean, cov))
return mean, eval_count, all_dists
x0 = np.array([-1, -1])
sigma = 2 # step size
population_size = 20
iterations = 500
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)
ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)
xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
x0 = np.array([-1, -1])
sigma = 2 # step size
population_size = 5
iterations = 500
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)
ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)
xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
x0 = np.array([-1, -1])
sigma = 1 # step size
population_size = 20
iterations = 500
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)
ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)
xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
x0 = np.array([-1, -1])
sigma = 5 # step size
population_size = 20
iterations = 500
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)
ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)
xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
D = 10
x0 = -np.ones(D)
sigma = 2 # step size
population_size = 20
iterations = 500
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 20
iterations = 500
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 20
iterations = 3000
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations, exp_filter=0.9)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 20
iterations = 10000
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations, exp_filter=0.9)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 100
iterations = 3000
x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations, exp_filter=0.9)
print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()