In [133]:
import numpy as np
from matplotlib import rc
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from mpl_toolkits import mplot3d
import matplotlib.cm as cm
rc('animation', html='html5')
sns.set(style='whitegrid', rc={'figure.figsize':(10,8)})

(15.1)

In [41]:
def rosenbrock_2d(x, y):
    return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2

(a) Plot the Rosenbrock function

In [87]:
def plot_rosenbrock(lim):
    x = np.linspace(-lim, lim, 50)
    y = np.linspace(-lim, lim, 50)

    X, Y = np.meshgrid(x, y)
    Z = rosenbrock_2d(X, Y)

    fig = plt.figure()
    ax = plt.axes(projection='3d')
    ax.plot_surface(X, Y, Z, rstride=1, cstride=1,
                    cmap='viridis', edgecolor='none')
    
    ax.set_title('Rosenbrock over [-%.1f, %.1f]x[-%.1f, %.1f]' % (lim, lim, lim, lim), y=1.05)

    ax.view_init(35, 35)
    
    return fig, ax
In [88]:
plot_rosenbrock(1)
Out[88]:
(<Figure size 720x576 with 1 Axes>,
 <matplotlib.axes._subplots.Axes3DSubplot at 0x7fd935acff98>)
In [89]:
plot_rosenbrock(5)
Out[89]:
(<Figure size 720x576 with 1 Axes>,
 <matplotlib.axes._subplots.Axes3DSubplot at 0x7fd935eb3198>)
In [90]:
plot_rosenbrock(0.2)
Out[90]:
(<Figure size 720x576 with 1 Axes>,
 <matplotlib.axes._subplots.Axes3DSubplot at 0x7fd9354ce668>)

(b) Downhill Simplex

First, let's define a general Rosenbrock function:

In [46]:
def rosenbrock(x):
    s = 0
    for i in range(1, len(x)):
        s += (1 - x[i - 1]) ** 2 + 100 * (x[i] - x[i - 1] ** 2) ** 2
    return s
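
For reference, the same sum can also be written without the loop; this vectorized version (my addition, not used below) is just a cross-check:

In [ ]:
# equivalent vectorized form (for reference only; the loop version above is what's used below)
def rosenbrock_vec(x):
    x = np.asarray(x)
    return np.sum((1 - x[:-1]) ** 2 + 100 * (x[1:] - x[:-1] ** 2) ** 2)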

Now, a function that takes a point and returns a simplex. My simplex structure is a list of tuples: the first element of each tuple is the function evaluated at that point, and the second is the point itself (caching the values saves us some function evaluations). A small illustrative check of this structure follows the simplex-step code below.

In [101]:
# given a D-dimensional point x, returns a simplex containing x with unit-vector edges
def init_simplex(x_init, f):
    xs = [x_init]
    for i in range(len(x_init)):
        x_new = x_init.copy()
        x_new[i] += 1
        xs.append(x_new)
    # sort points according to f, worst (largest) value first
    simplex = [(f(x), x) for x in xs]
    simplex.sort(key=lambda x: x[0], reverse=True)
    return simplex

# inserts a value and point x into the simplex at the correct position and removes the first (worst) element
def insert_to_simplex(simplex, x, val):
    i = 0
    while i < len(simplex):
        if simplex[i][0] < val:
            break
        i += 1
    if i > 1:
        return simplex[1:i] + [(val, x)] + simplex[i:]
    else:
        return [(val, x)] + simplex[1:]

def simplex_step(simplex, f):
    eval_count = 0
    
    x_mean = np.mean([x for _, x in simplex[1:]], axis=0)
    
    # reflect
    x_reflect = 2 * x_mean - simplex[0][1]
    val_reflect = f(x_reflect)
    eval_count += 1
    
    if val_reflect < simplex[-1][0]:
        # reflect is a good move, try grow
        x_grow = 3 * x_mean - 2 * simplex[0][1]
        val_grow = f(x_grow)
        eval_count += 1
        
        if val_grow < val_reflect:
            # grow is even better
            simplex.append((val_grow, x_grow))
        else:
            simplex.append((val_reflect, x_reflect))
            
        # remove the worst point
        simplex.pop(0)
    elif val_reflect > simplex[0][0]:
        # we overshot the minimum, try reflect-shrink
        x_reflect_shrink = (3 * x_mean - simplex[0][1]) / 2
        val_reflect_shrink = f(x_reflect_shrink)
        eval_count += 1
        
        if val_reflect_shrink < simplex[0][0]:
            # good, accept it
            simplex = insert_to_simplex(simplex, x_reflect_shrink, val_reflect_shrink)
        else:
            # not good, try just shrink
            x_shrink = (x_mean + simplex[0][1]) / 2
            val_shrink = f(x_shrink)
            eval_count += 1
            
            if val_shrink < simplex[0][0]:
                # good, accept it
                simplex = insert_to_simplex(simplex, x_shrink, val_shrink)
            else:
                # damn, shrink everything
                for i in range(len(simplex) - 1):
                    x_new = (simplex[i][1] + simplex[-1][1]) / 2
                    val_new = f(x_new)
                    eval_count += 1
                    
                    simplex[i] = (val_new, x_new)
    else:
        # reflect is good (but not best), keep it
        simplex = insert_to_simplex(simplex, x_reflect, val_reflect)
    
    return simplex, eval_count
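
Quick illustration of the simplex structure (this cell is just for inspection, it wasn't part of the original run): a 2-D start point yields three (value, point) tuples, sorted worst-first.

In [ ]:
# illustrative: inspect the simplex structure for a 2-D start point
for val, point in init_simplex(np.array([-1.0, -1.0]), rosenbrock):
    print(val, point)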
In [81]:
def plot_simplex_3d(simplex, ax):
    x = [p[0] for (_, p) in simplex]
    y = [p[1] for (_, p) in simplex]
    z = [z for (z, _) in simplex]
    #add the first point
    x.append(simplex[0][1][0])
    y.append(simplex[0][1][1])
    z.append(simplex[0][0])
    
    ax.plot3D(x, y, z, 'red')
In [92]:
steps_total = 0
evals_total = 0

x_init = np.array([-1, -1])

# init simplex
simplex = init_simplex(x_init, rosenbrock)
diff = simplex[0][0] - simplex[-1][0]

# collect all simplexes to animate later
all_simplexes = [simplex.copy()]

# stopping condition: stop when the difference between the worst and best values of the simplex is smaller than 1e-4
while diff > 1e-4:
    simplex, evals = simplex_step(simplex, rosenbrock)
    
    steps_total += 1
    evals_total += evals
    
    diff = simplex[0][0] - simplex[-1][0]
    
    all_simplexes.append(simplex.copy())
    
print("Best point:", simplex[-1][1])
print("Value:", simplex[-1][0])
print("Total Steps:", steps_total)
print("Total Evaluations:", evals_total)
Best point: [0.99947491 0.99938512]
Value: 1.9200569960447368e-05
Total Steps: 50
Total Evaluations: 94
In [93]:
fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Downhill Simplex", y=1.1)

def update_anim(i):
    plot_simplex_3d(all_simplexes[i], ax)

animation.FuncAnimation(fig, update_anim, range(steps_total), blit=False, interval=200, repeat=True)
Out[93]:

(c) 10-dimensional Rosenbrock using Downhill Simplex

In [359]:
D = 10

steps_total = 0
evals_total = 0

x_init = -np.ones(D)

# init simplex
simplex = init_simplex(x_init, rosenbrock)
diff = simplex[0][0] - simplex[-1][0]

# collect all simplexes to animate later
all_simplexes = [simplex.copy()]

# stopping condition: stop when the difference between the worst and best values of the simplex is smaller than 1e-4
while diff > 1e-4:
    simplex, evals = simplex_step(simplex, rosenbrock)
    
    steps_total += 1
    evals_total += evals
    
    diff = simplex[0][0] - simplex[-1][0]
    
    all_simplexes.append(simplex.copy())
    
print("Best point:", simplex[-1][1])
print("Value:", simplex[-1][0])
print("Total Steps:", steps_total)
print("Total Evaluations:", evals_total)
Best point: [1.00041312 1.00045738 0.99946566 0.99944602 0.9990324  0.99836958
 0.99736053 0.99465021 0.98888266 0.97818427]
Value: 0.0004926127996431997
Total Steps: 2045
Total Evaluations: 2790
In [ ]:
# takes too long, not worth it

# frames = 1000

# cmap = sns.color_palette("coolwarm", frames + 1)
# seq_colors = [cmap[i] for i in range(frames + 1)]

# fig = plt.figure(figsize=(12, 8))  
# axes = fig.subplots(nrows=2, ncols=3)
# for i in range(2):
#     for j in range(3):
#         axes[i][j].set_xlabel("X[%d]" % (i * 6 + j * 2))
#         axes[i][j].set_ylabel("X[%d]" % (i * 6 + j * 2 + 1))
#         axes[i][j].set_xlim(-1, 1)
#         axes[i][j].set_ylim(-1, 1)
# axes[1][2].remove()
# fig.tight_layout()

# def update_anim(k):
#     for i in range(2):
#         for j in range(3):
#             idx = i * 6 + j * 2
#             if idx > 9:
#                 continue
#             axes[i][j].plot(all_simplexes[k][-1][1][idx], 
#                             all_simplexes[k][-1][1][idx+1], 
#                             'o', 
#                             c=seq_colors[k])

# animation.FuncAnimation(fig, update_anim, range(frames), blit=False, interval=30, repeat=True)

(d) Conjugate Gradient (Polak-Ribière)

In [163]:
# given a function, a start point, and a direction - find the minimum along the line
# (bracketing and refining with golden-ratio steps: 2 / (1 + sqrt(5)) = 1/phi ~ 0.618)
def line_minimize(f, x0, d):
    a = 1
    x1 = x0 + a * d
    b = 2 * a / (1 + np.sqrt(5))
    x2 = x1 + b * d
    
    eval_count = 3
    fx0 = f(x0)
    fx1 = f(x1)
    fx2 = f(x2)
    
    # if x1 is the max of the triplet, move toward whichever of x0, x2 is lower
    if fx1 > fx0 and fx1 > fx2:
        if fx0 > fx2:
            x0, x1 = x1, x2
            fx0, fx1 = fx1, fx2
            a = b
        else:
            x0, x1 = x1, x0
            fx0, fx1 = fx1, fx0
            a = -a
        b = 2 * a / (1 + np.sqrt(5))
        x2 = x1 + b * d
        fx2 = f(x2)
        eval_count += 1
             
    # now, if the minimum isn't bracketed yet (values still descending in one direction),
    # keep stepping along the line until f(x1) is below *both* f(x0) and f(x2)
    while not (fx1 < fx0 and fx1 < fx2):
        if fx0 > fx2:
            x1 = x2
            fx1 = fx2
            a = a + b
        else:
            x0, x1 = x2, x0
            fx0, fx1 = fx2, fx0
            a = -(a + b)
            
        b = 2 * a / (1 + np.sqrt(5))
        x2 = x1 + b * d
        fx2 = f(x2)
        eval_count += 1
        
    # we now have x1 between x0 and x2
    # now drill down until the bracket values are almost the same
    while abs(fx0 - fx2) > 1e-3:
        c = 2 * b / (1 + np.sqrt(5))
        x3 = x1 - c * d
        fx3 = f(x3)
        eval_count += 1
        
        if fx3 < fx1:
            x1, x2 = x3, x1
            fx1, fx2 = fx3, fx1
            a = a - c
            b = c
        else:
            x0, x2 = x2, x3
            fx0, fx2 = fx2, fx3
            a = -b
            b = -c
    
    return x1, fx1, eval_count
In [167]:
# sanity check

x0 = np.array([1, -1])
d0 = np.array([0, 1])

line_minimize(rosenbrock, x0, d0)
Out[167]:
(array([1., 1.]), 1.232595164407831e-30, 17)

Let's calculate the gradient of the Rosenbrock function:

$$ \frac{\partial f}{\partial x_0} = -2(1 - x_0) - 400 x_0 (x_1 - x_0^2) $$

$$ \frac{\partial f}{\partial x_{D - 1}} = 200(x_{D - 1} - x_{D - 2}^2) $$

$$ \forall 0 < i < D - 1 : \frac{\partial f}{\partial x_i} = -2(1 - x_i) - 400 x_i (x_{i + 1} - x_i^2) + 200(x_i - x_{i - 1}^2) $$

In [168]:
def rosenbrock_grad(x):
    g = np.zeros(len(x))
    g[0] = -2 * (1 - x[0]) - 400 * x[0] * (x[1] - x[0] ** 2)
    g[-1] = 200 * (x[-1] - x[-2] ** 2)
    for i in range(1, len(x) - 1):
        g[i] = -2 * (1 - x[i]) - 400 * x[i] * (x[i + 1] - x[i] ** 2) + 200 * (x[i] - x[i - 1] ** 2)
    return g
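
As a quick sanity check (illustrative, not part of the original runs), the analytic gradient can be compared against central finite differences:

In [ ]:
# illustrative sanity check: analytic gradient vs. central finite differences
def numeric_grad(f, x, h=1e-6):
    g = np.zeros(len(x))
    for i in range(len(x)):
        e = np.zeros(len(x))
        e[i] = h
        g[i] = (f(x + e) - f(x - e)) / (2 * h)
    return g

x_test = np.array([-1.0, 0.5, 2.0])
print(rosenbrock_grad(x_test))
print(numeric_grad(rosenbrock, x_test))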
In [380]:
def polak_ribiere(f, x, d):
    eval_count = 1
    lines_count = 0
    fx = f(x)
    g = rosenbrock_grad(x)
    
    diff = 1000 # just some big number
    
    # save for plotting
    all_lines = [(x, fx)]
    
    while diff > 1e-4:
        
        x_new, fx_new, evals = line_minimize(f, x, d)
        
        all_lines.append((x_new, fx_new))
        eval_count += evals
        
        g_new = rosenbrock_grad(x_new)
        
        gamma = np.dot(g_new, g_new - g) / np.dot(g, g)
        
        d = g_new + gamma * d
        g = g_new
        x = x_new
        diff = abs(fx_new - fx)
        fx = fx_new
        
    return x, fx, eval_count, all_lines
In [381]:
x0 = np.array([-1, -1])
d0 = np.array([1, 0.1])

x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)

print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)
Best point: [1.00707275 1.01542417]
Value: 0.00020098098280117396
Total Lines: 17
Total Evaluations: 292
In [382]:
fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Conjugate Gradient", y=1.1)

def update_anim(i):
    if i > 0:
        x = [lines[i-1][0][0], lines[i][0][0]]
        y = [lines[i-1][0][1], lines[i][0][1]]
        z = [lines[i-1][1], lines[i][1]]
        ax.plot3D(x, y, z, 'red')

animation.FuncAnimation(fig, update_anim, range(len(lines)), blit=False, interval=300, repeat=True)
Out[382]:
In [383]:
x0 = np.array([-1, -1])
d0 = np.array([0.1, 1])

x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)

print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)

fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Conjugate Gradient", y=1.1)

def update_anim(i):
    if i > 0:
        x = [lines[i-1][0][0], lines[i][0][0]]
        y = [lines[i-1][0][1], lines[i][0][1]]
        z = [lines[i-1][1], lines[i][1]]
        ax.plot3D(x, y, z, 'red')

animation.FuncAnimation(fig, update_anim, range(len(lines)), blit=False, interval=300, repeat=True)
Best point: [0.61968544 0.37628264]
Value: 0.15061043139730068
Total Lines: 12
Total Evaluations: 206
Out[383]:

This is not so good...

So, according to Wikipedia, a smart idea is to start with the direction of steepest descent, and then every $D$ iterations reset the direction $d$ to the steepest-descent direction (the negative gradient) at that point. Apparently the directions lose conjugacy after $D$ iterations (or fewer).

If after a reset we have no progress, we're done.

In [243]:
def polak_ribiere(f, x, d):
    eval_count = 1
    lines_count = 0
    fx = f(x)
    g = rosenbrock_grad(x)
    
    # save for plotting
    all_lines = [(x, fx)]
    i = 0
    
    while True:    
        x_new, fx_new, evals = line_minimize(f, x, d)
    
        if i == 0 and abs(fx - fx_new) < 1e-4:
            return x, fx, eval_count, all_lines
        
        all_lines.append((x_new, fx_new))
        eval_count += evals
        
        g_new = rosenbrock_grad(x_new)
        
        i += 1
        if i < max(10, len(x)): # if D is small (2), we reset too often and this algo becomes steepest descent
            gamma = np.dot(g_new, g_new - g) / np.dot(g, g)

            d = g_new + gamma * d
            g = g_new
            x = x_new
            fx = fx_new
        else:
            # RESET
            d = -g_new
            g = g_new
            x = x_new
            fx = fx_new
            i = 0
In [244]:
x0 = np.array([-1, -1])
d0 = np.array([0.1, 1])

x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)

print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)
Best point: [1.04290379 1.08917117]
Value: 0.002072645682276332
Total Lines: 21
Total Evaluations: 363
In [245]:
fig, ax = plot_rosenbrock(1)
ax.set_title("Rosenbrock - Conjugate Gradient", y=1.1)

def update_anim(i):
    if i > 0:
        x = [lines[i-1][0][0], lines[i][0][0]]
        y = [lines[i-1][0][1], lines[i][0][1]]
        z = [lines[i-1][1], lines[i][1]]
        ax.plot3D(x, y, z, 'red')

animation.FuncAnimation(fig, update_anim, range(len(lines)), blit=False, interval=300, repeat=True)
Out[245]:

Now in 10 dimensions

In [246]:
D = 10

x0 = -np.ones(D)
d0 = -rosenbrock_grad(x0) # steepest descent

x, fx, evals, lines = polak_ribiere(rosenbrock, x0, d0)

print("Best point:", x)
print("Value:", fx)
print("Total Lines:", len(lines))
print("Total Evaluations:", evals)
Best point: [0.99912705 0.99846906 0.99680088 0.99374817 0.9868494  0.97374107
 0.94760058 0.89775446 0.80538762 0.64768422]
Value: 0.05220843653821661
Total Lines: 91
Total Evaluations: 1873

(e) CMA-ES (or something close to it)

A good, clear overview of the algorithm: The CMA Evolution Strategy: A Tutorial (Hansen, 2016).

In [363]:
def cma_es(f, x0, sigma, population_size, exp_filter=0.5, iterations=100):
    eval_count = 0
    N = len(x0)
    
    mean = x0
    cov = np.identity(N) 
    
    # for plotting
    all_dists = [(mean, cov)]
    
    for j in range(iterations):
        # sample a population from a multivariate normal distribution
        population = []
        for i in range(population_size):
            x = np.random.multivariate_normal(mean, cov * sigma)
            fx = f(x)
            eval_count += 1
            population.append((fx, x))

        # now we need to use the fx's (fitness) to generate weights for each sample in the population
        # first, let's sort the population according to fitness
        population.sort(key=lambda x: x[0])
        
        # according to the tutorial it's a good idea to consider half of the population
        # so half of the weights will be zero
        # I played around with two weight schemes; in the first, all (nonzero) weights are equal:
        # A) equal weights
        weights = np.zeros(population_size)
        for i in range(population_size // 2):
            weights[i] = 1
        weights /= np.sum(weights)
        # Update: This works fine for 2D, but doesn't converge for 10D!
        
        # B) descending weights:
        weights = np.zeros(population_size)
        for i in range(population_size // 2):
            weights[i] = (population_size // 2) - i
        weights /= np.sum(weights)
        
        # Calculate the new weighted mean
        mean_new = np.zeros(N)
        for i in range(population_size // 2):
            mean_new += weights[i] * population[i][1]
        
        # now we need to update the covariance matrix
        # there seem to be a lot of ways to do this
        # I'm going to keep it simple and apply equation (12) from page 10 of the tutorial referenced above
        cov_new = np.zeros((N, N))
        for i in range(population_size // 2):
            v = population[i][1] - mean
            cov_new = np.add(cov_new, weights[i] * np.outer(v, v))

        # update params using exponential filter
        mean = exp_filter * mean + (1 - exp_filter) * mean_new
        cov = exp_filter * cov + (1 - exp_filter) * cov_new
        
        # when to stop??
        
        all_dists.append((mean, cov))
        
    return mean, eval_count, all_dists
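
One simple possibility for the open "when to stop??" question above (a sketch, not used in any of the runs below): break once the fitness spread within the sorted population drops below a tolerance.

In [ ]:
# sketch of a possible stopping rule (not used in the runs below):
# inside the loop in cma_es, right after sorting the population:
#
#     if population[-1][0] - population[0][0] < 1e-12:
#         break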
In [364]:
x0 = np.array([-1, -1])
sigma = 2 # step size
population_size = 20
iterations = 500

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)

ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)

xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
Best point: [1. 1.]
Value: 4.942706609275402e-30
Total Evaluations: 10000
In [365]:
x0 = np.array([-1, -1])
sigma = 2 # step size
population_size = 5
iterations = 500

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)

ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)

xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
Best point: [1. 1.]
Value: 1.9721522630525295e-31
Total Evaluations: 2500
In [366]:
x0 = np.array([-1, -1])
sigma = 1 # step size
population_size = 20
iterations = 500

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)

ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)

xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
Best point: [1. 1.]
Value: 0.0
Total Evaluations: 10000
In [367]:
x0 = np.array([-1, -1])
sigma = 5  # step size
population_size = 20
iterations = 500

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.view_init(35, 35)

ax.set_title("Rosenbrock - CMA-ES (Population: %d, Step: %d, Iter: %d)" % (population_size, sigma, iterations), y=1.1)

xdata = [dist[0][0] for dist in dists]
ydata = [dist[0][1] for dist in dists]
zdata = [rosenbrock(dist[0]) for dist in dists]
ax.scatter3D(xdata, ydata, zdata, c='red', s=30)
plt.show()
/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:15: RuntimeWarning: covariance is not symmetric positive-semidefinite.
  from ipykernel import kernelapp as app
Best point: [1.50973321e+17 1.96430070e+33]
Value: 4.338323693973395e+70
Total Evaluations: 10000

Now for 10 dimensions

In [394]:
D = 10
x0 = -np.ones(D)
sigma = 2 # step size
population_size = 20
iterations = 500

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
Best point: [ 0.2242136  -0.07551011 -0.1398226   0.56014533  0.12190915 -0.80834995
  0.47372534 -0.07613882  0.55698259  1.82724287]
Value: 386.0725249112632
Total Evaluations: 10000
In [395]:
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 20
iterations = 500

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
Best point: [ 0.16863649  0.08887491 -0.05707658 -0.01480126  0.0812831   0.02732766
  0.00572325  0.08664034 -0.02576304  0.07594985]
Value: 11.283854519705866
Total Evaluations: 10000
In [396]:
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 20
iterations = 3000

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations, exp_filter=0.9)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
Best point: [ 0.90865591  0.82172883  0.64485622  0.39645754  0.1731705   0.02674653
  0.04364712 -0.02211765 -0.00224887  0.01576867]
Value: 5.550883692531492
Total Evaluations: 60000
In [397]:
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 20
iterations = 10000

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations, exp_filter=0.9)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
Best point: [0.9102409  0.84927127 0.70259077 0.51797466 0.23200081 0.06059494
 0.01435903 0.01912643 0.01813716 0.01699063]
Value: 5.101523099858765
Total Evaluations: 200000
In [398]:
D = 10
x0 = -np.ones(D)
sigma = 1 # step size
population_size = 100
iterations = 3000

x, evals, dists = cma_es(rosenbrock, x0, sigma, population_size, iterations=iterations, exp_filter=0.9)

print("Best point:", x)
print("Value:", rosenbrock(x))
print("Total Evaluations:", evals)

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()

xdata = range(len(dists))
ydata = [rosenbrock(dist[0]) for dist in dists]
plt.plot(xdata, ydata)
plt.ylim(0, 100)
plt.ylabel("f(x)")
plt.xlabel("Iterations")
plt.axhline(0, color='red')
plt.show()
Best point: [1.00031932 1.00169721 0.99540408 0.99455518 0.98400207 0.96321727
 0.91752566 0.85165603 0.71761767 0.50434232]
Value: 0.16065765270112953
Total Evaluations: 300000

Update: running the above experiments with the equal-weight scheme didn't work... however, with the other simple, descending weight scheme used above,

$$ w_i = \frac{\lambda}{2} - i \quad \text{for } 0 \le i < \lambda/2, \qquad w_i = 0 \text{ otherwise} $$

(where $\lambda$ is the population size and the weights are normalized to sum to 1), the last try, with population size 100 and sigma 1, works!
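
For concreteness, this is what that weight vector looks like for a small population (illustrative cell, not from the original runs):

In [ ]:
# illustrative: the descending weights for a population of 10 (only the top half is nonzero)
lam = 10
weights = np.zeros(lam)
for i in range(lam // 2):
    weights[i] = (lam // 2) - i
weights /= np.sum(weights)
print(weights)  # [5, 4, 3, 2, 1, 0, ...] / 15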