(15.1)

(a) Plot the Rosenbrock function [Rosenbrock, 1960]

$f = (1 - x)^2 + 100(y - x^2)^2$

In [65]:
# From Neil's nm.py code

import copy as cp
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

evals = 0  # global function-evaluation counter (f raises a NameError without it)

def f(x):
    # 2D Rosenbrock function; increments the global evaluation counter
    global evals
    evals += 1
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
(X, Y) = np.meshgrid(x, y)
Z = f([X, Y])

fig = plt.figure()
ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) was removed in matplotlib 3.6
ax.set_proj_type('ortho')
surf = ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)
plt.title("Rosenbrock Function")
plt.show()
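
The minimum is at $(x, y) = (1, 1)$, where both squared terms vanish and $f = 0$. A quick sanity check before searching for it numerically (an extra cell added here, not part of the original solution):

In [ ]:
# Sanity check: the Rosenbrock minimum is at (1, 1) with f = 0
assert f([1.0, 1.0]) == 0.0
print('f(1, 1) =', f([1.0, 1.0]))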

(b) Pick a stopping criterion and use the downhill simplex method to search for its minimum starting from $x = y = -1$, plotting the search path, and counting the number of algorithm iterations and function evaluations used.

In [66]:
evals = 0

def nelder_mead(f, start, start_size, tolerance):
    global evals
    evals = 0
    d = start.size

    # Build the initial simplex: the start point plus one vertex
    # offset by start_size along each coordinate axis
    x = []
    x.append({'pt': cp.deepcopy(start), 'fn': f(start)})
    for i in range(d):
        x.append({'pt': cp.deepcopy(start), 'fn': 0.0})
        x[-1]['pt'][i] += start_size
        x[-1]['fn'] = f(x[-1]['pt'])

    # Keep the vertices sorted by function value and record
    # the simplex at every iteration for plotting
    x_s = []
    x.sort(key=lambda item: item['fn'])
    x_s.append(cp.deepcopy(x))
    
    while True:
        # Midpoint (centroid) of the face opposite the worst vertex
        mid = np.zeros(d)
        for i in range(d):
            mid += x[i]['pt']
        mid = mid/d

        # Reflect the worst vertex through the midpoint
        y = mid + (mid - x[-1]['pt'])
        f_y = f(y)
        
        if (f_y < x[0]['fn']):
            # Reflection beat the best vertex: try doubling the step
            z = mid + 2.0*(mid - x[-1]['pt'])
            f_z = f(z)

            if (f_z < f_y):
                x[-1] = {'pt': z, 'fn': f_z}
            else:
                x[-1] = {'pt': y, 'fn': f_y}

        elif (f_y < x[-2]['fn']):
            # Reflection at least beat the second-worst vertex: keep it
            x[-1] = {'pt': y, 'fn': f_y}
            
        else:
            # Try a half-step reflection
            y = mid + 0.5*(mid - x[-1]['pt'])
            f_y = f(y)

            if (f_y < x[-2]['fn']):
                x[-1] = {'pt': y, 'fn': f_y}
            else:
                # Try shrinking the worst vertex toward the midpoint
                y = mid - 0.5*(mid - x[-1]['pt'])
                f_y = f(y)

                if (f_y < x[-2]['fn']):
                    x[-1] = {'pt': y, 'fn': f_y}
                else:
                    # Nothing helped: shrink the whole simplex toward the best vertex
                    for i in range(1, d+1):
                        x[i]['pt'] = (x[0]['pt'] + x[i]['pt'])/2.0
                        x[i]['fn'] = f(x[i]['pt'])
                        
        x.sort(key=lambda item:item['fn'])
        x_s.append(cp.deepcopy(x))
        
        # Stop when the best value improves by less than the tolerance;
        # an improvement of exactly zero means a shrink step, so keep going
        error = abs(x_s[-1][0]['fn'] - x_s[-2][0]['fn'])

        if ((error > 0) and (error < tolerance)):
            break
            
    return x_s

D = 2
start_size = 0.1
tolerance = 1e-6
start = -1*np.ones(D)
x_s = nelder_mead(f, start, start_size, tolerance)
n_evals = evals  # snapshot before the plotting grid below adds one more call to f

plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
(X, Y) = np.meshgrid(x, y)
Z = f([X, Y])

plt.ion()
point = ''

for i in range(len(start)):
    point += '%.3f '%x_s[-1][0]['pt'][i]
    
title = 'Rosenbrock Nelder-Mead search'\
    + '\nfinal point: ' + point\
    + '\ntolerance: ' + str(tolerance)\
    + '\niterations: ' + str(len(x_s))\
    + '  function evaluations: ' + str(n_evals)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.set_proj_type('ortho')
surf = ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)

# Draw each stored simplex as a closed triangle over the surface
for i in range(len(x_s)):
    x = []
    y = []
    z = []
    for j in [0, 1, 2, 0]:  # revisit vertex 0 to close the triangle
        x.append(x_s[i][j]['pt'][0])
        y.append(x_s[i][j]['pt'][1])
        z.append(x_s[i][j]['fn'])
    ax.plot(x, y, z, color='black', marker='.')

plt.title(title)
plt.show()
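
As a cross-check (not part of the original solution), SciPy's reference Nelder-Mead implementation can be run from the same start, assuming scipy is available; its counts depend on its own initial simplex and tolerances, so they will differ somewhat:

In [ ]:
# Cross-check with SciPy's Nelder-Mead (assumes scipy is installed)
from scipy.optimize import minimize, rosen

res = minimize(rosen, np.array([-1.0, -1.0]), method='Nelder-Mead',
               options={'xatol': 1e-6, 'fatol': 1e-6})
print('final point:', res.x)
print('iterations:', res.nit, ' function evaluations:', res.nfev)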

(c) Now repeat the simplex search for $D = 10$ in the $D$-dimensional generalization of the Rosenbrock function:

$f = \sum_{i=1}^{D-1} \left[ (1 - x_{i-1})^2 + 100(x_i - x_{i-1}^2)^2 \right]$

Start all the $x_i$ at -1, plot a 2D projection of the search path, and again count algorithm iterations and function evaluations.

In [67]:
# The contour background below uses the 2D Rosenbrock f from above;
# the 10D generalization redefines f afterward, so run this cell in order
plot_size = 100
xy_min = -1.5
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
(X, Y) = np.meshgrid(x, y)
Z = f([X, Y])

def f(x):
    # D-dimensional generalization of the Rosenbrock function
    global evals, D
    evals += 1
    total = 0
    for i in range(1, D):
        total += (1 - x[i-1])**2 + 100*(x[i] - x[i-1]**2)**2
    return total

D = 10
start_size = 0.1
tolerance = 1e-6
start = -1*np.ones(D)
x_s = nelder_mead(f, start, start_size, tolerance)

point = ''

for i in range(len(start)):
    point += '%.3f '%x_s[-1][0]['pt'][i]

title = '10D Rosenbrock Nelder-Mead search'\
    + '\nfinal point: ' + point\
    + '\ntolerance: ' + str(tolerance)\
    + '\niterations: ' + str(len(x_s))\
    + '  function evaluations: ' + str(evals)

fig, ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=0.5)

# Collect the first two coordinates of each simplex vertex, repeating
# vertex 0 to close the triangles; ax.plot on these lists-of-lists
# draws one closed triangle per stored iteration
x = [[], [], [], []]
y = [[], [], [], []]

for i in range(len(x_s)):
    for k, j in enumerate([0, 1, 2, 0]):
        x[k].append(x_s[i][j]['pt'][0])
        y[k].append(x_s[i][j]['pt'][1])

ax.plot(x, y, color='white', marker='.')
plt.title(title)
plt.show()
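
The loop in f can also be written with NumPy slicing; a sketch is below (f_vec is a hypothetical helper name, equal in value to the loop version but without the evaluation counter):

In [ ]:
# Vectorized sketch of the D-dimensional Rosenbrock function
# (f_vec is a hypothetical helper, not used by the searches above)
def f_vec(x):
    x = np.asarray(x)
    return np.sum((1 - x[:-1])**2 + 100*(x[1:] - x[:-1]**2)**2)

print(f_vec(-1*np.ones(10)))  # should agree with the loop version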

(d) Repeat and compare the 2D and 10D Rosenbrock search with the gradient descent method.

In [68]:
# From Neil's gd.py code

import copy as cp
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

evals = 0

def gradient_descent(f, df, start, start_scale, rate, momentum, tolerance):
    scale = start_scale
    x = np.copy(start)
    last_x = np.copy(start)
    last_last_x = np.copy(start)
    pts = [{'pt': np.copy(x), 'fn': f(x)}]

    while True:
        last_last_x = np.copy(last_x)
        last_x = np.copy(x)
        # Gradient step plus a momentum term from the last accepted move
        x_new = x - scale*df(x) + momentum*(last_x - last_last_x)
        f_n = f(x_new)

        if (f_n <= pts[-1]['fn']):
            # Downhill step: accept it and grow the step size
            x = np.copy(x_new)
            pts.append({'pt': np.copy(x), 'fn': f_n})
            scale *= rate
            error = abs(pts[-1]['fn'] - pts[-2]['fn'])

            if (error < tolerance):
                break

        else:
            # Uphill step: reject it and shrink the step size
            scale *= 1/rate

    return pts

def f(x):
    global evals
    evals += 1
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

def df(x):
    # Analytic gradient of the 2D Rosenbrock; count it as two evaluations
    global evals
    evals += 2
    return np.array([-2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0], 200*(x[1] - x[0]**2)])

D = 2
start = -1*np.ones(D)
start_scale = 1e-5
rate = 1.5
momentum = 0.95
tolerance = 1e-6
pts = gradient_descent(f, df, start, start_scale, rate, momentum, tolerance)
n_evals = evals  # snapshot before the plotting grid below adds one more call to f

plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
(X, Y) = np.meshgrid(x, y)
Z = f([X, Y])

x = []
y = []
z = []

for i in range(len(pts)):
    x.append(pts[i]['pt'][0])
    y.append(pts[i]['pt'][1])
    z.append(pts[i]['fn'])
    
point = ''

for i in range(len(start)):
    point += '%.3f '%pts[-1]['pt'][i]
    
title = '2D Rosenbrock gradient descent search (with momentum)'\
    + '\nfinal point: ' + point\
    + '\ntolerance: ' + str(tolerance)\
    + '\niterations: ' + str(len(pts))\
    + '  function evaluations: ' + str(n_evals)

plt.ion()
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.set_proj_type('ortho')
ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)
ax.plot(x, y, z, color='black')
plt.title(title)
plt.show()
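
To see what the momentum term buys, the same 2D search can be rerun with momentum = 0 and the counts compared (an optional experiment, not in the original solution; the adaptive step size still applies):

In [ ]:
# Optional comparison: the same 2D search without momentum
evals = 0
pts0 = gradient_descent(f, df, start, start_scale, rate, 0.0, tolerance)
print('final point:', pts0[-1]['pt'])
print('iterations:', len(pts0), ' function evaluations:', evals)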
In [69]:
# As in part (c), the contour background uses the 2D f still in scope;
# the 10D f and df are redefined below
plot_size = 100
xy_min = -1.5
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
(X, Y) = np.meshgrid(x, y)
Z = f([X, Y])

def f(x):
    # D-dimensional generalization of the Rosenbrock function
    global evals, D
    evals += 1
    total = 0
    for i in range(1, D):
        total += (1 - x[i-1])**2 + 100*(x[i] - x[i-1]**2)**2
    return total

def df(x):
    # Analytic gradient of the D-dimensional Rosenbrock;
    # count each gradient call as D function evaluations
    global evals, D
    evals += D
    grad = np.zeros(D)
    grad[0] = -2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0]
    grad[D-1] = 200*(x[D-1] - x[D-2]**2)
    for i in range(1, D-1):
        grad[i] = -2*(1 - x[i]) - 400*(x[i+1] - x[i]**2)*x[i] + 200*(x[i] - x[i-1]**2)
    return grad

D = 10
start = -1*np.ones(D)  # start all the x_i at -1, as the problem specifies
start_scale = 1e-5
rate = 1.5
tolerance = 1e-6
momentum = 0.95
evals = 0  # reset the counter carried over from the 2D search
pts = gradient_descent(f, df, start, start_scale, rate, momentum, tolerance)

point = ''

for i in range(len(start)):
    point += '%.3f '%pts[-1]['pt'][i]
    
title = '10D Rosenbrock gradient descent search (with momentum)'\
    + '\nfinal point: ' + point\
    + '\ntolerance: ' + str(tolerance)\
    + '\niterations: ' + str(len(pts))\
    + '  function evaluations: ' + str(evals)

x = []
y = []
z = []

for i in range(len(pts)):
    x.append(pts[i]['pt'][0])
    y.append(pts[i]['pt'][1])
    z.append(pts[i]['fn'])
    
fig, ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=0.5)
ax.plot(x, y, color='white')
plt.title(title)
plt.show()
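
Since the global minimum of the generalized Rosenbrock function is at $x_i = 1$ with $f = 0$, the final point can be checked directly:

In [ ]:
# Distance of the 10D gradient descent result from the known minimum
print('distance from (1, ..., 1):', np.linalg.norm(pts[-1]['pt'] - np.ones(D)))
print('final f:', pts[-1]['fn'])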

(e) Repeat and compare the 2D and 10D Rosenbrock search with the conjugate gradient method.

In [70]:
# From Neil's cg.py code

import copy as cp
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

evals = 0

def conjugate_gradient(f, df, start, start_step, tolerance):
    golden = (1 + math.sqrt(5))/2
    d = start.size
    pts = []
    t0 = {}
    t1 = {'pt': np.copy(start), 'fn': f(start)}
    t2 = {}
    t3 = {}
    pts.append(cp.deepcopy(t1))
    grad = df(start)
    dir = np.copy(grad)
    step = 0
    step_size = start_step

    while True:
        step += 1

        # Polak-Ribiere update of the search direction
        dir_old = np.copy(dir)
        grad_old = np.copy(grad)
        grad = df(t1['pt'])
        gamma = np.dot(grad, grad - grad_old)/np.dot(grad_old, grad_old)
        dir = grad + gamma*dir_old

        # Restart with the plain gradient every d+1 steps
        if (step % (d + 1) == 0):
            dir = np.copy(grad)

        norm_dir = -dir/np.sqrt(np.dot(dir, dir))
        
        t0 = cp.deepcopy(t1)

        # Find a downhill step along the search direction, then take a
        # golden-ratio step past it to start bracketing the line minimum
        while True:
            t1['pt'] = t0['pt'] + step_size*norm_dir
            t1['fn'] = f(t1['pt'])

            if (t1['fn'] > t0['fn']):
                step_size *= 0.5
                continue

            t2['pt'] = t0['pt'] + (t1['pt'] - t0['pt'])*golden
            t2['fn'] = f(t2['pt'])
            break
            
        # Expand the bracket until the middle point is the lowest
        while True:
            if (t1['fn'] < t2['fn']):
                break

            t0 = cp.deepcopy(t1)
            t1 = cp.deepcopy(t2)
            t2['pt'] = t1['pt'] + golden*(t1['pt'] - t0['pt'])
            t2['fn'] = f(t2['pt'])
            
        # Golden-section refinement of the bracketed line minimum
        while True:
            t3['pt'] = t1['pt'] + (t1['pt'] - t0['pt'])/golden
            t3['fn'] = f(t3['pt'])

            if (t3['fn'] > t1['fn']):
                t2 = cp.deepcopy(t0)
                t0 = cp.deepcopy(t3)

            else:
                t0 = cp.deepcopy(t1)
                t1 = cp.deepcopy(t3)

            error = abs(t1['fn'] - t0['fn']) + abs(t2['fn'] - t1['fn'])

            if (error < tolerance):
                break

        pts.append(cp.deepcopy(t1))
        # Stop when a full line search improves f by less than the tolerance
        error = abs(pts[-1]['fn'] - pts[-2]['fn'])

        if (error < tolerance):
            break
            
    return pts

def f(x):
    global evals
    evals += 1
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

def df(x):
    # Analytic gradient of the 2D Rosenbrock; count it as two evaluations
    global evals
    evals += 2
    return np.array([-2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0], 200*(x[1] - x[0]**2)])

dim = 2
start = -1*np.ones(dim)
start_step = 0.1
tolerance = 1e-6
pts = conjugate_gradient(f, df, start, start_step, tolerance)
n_evals = evals  # snapshot before the plotting grid below adds one more call to f

plot_size = 100
xy_min = -1.1
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
(X, Y) = np.meshgrid(x, y)
Z = f([X, Y])

x = []
y = []
z = []

for i in range(len(pts)):
    x.append(pts[i]['pt'][0])
    y.append(pts[i]['pt'][1])
    z.append(pts[i]['fn'])
    
point = ''

for i in range(len(start)):
    point += '%.3f '%pts[-1]['pt'][i]
    
title = '2D Rosenbrock conjugate gradient search'\
    + '\nfinal point: ' + point\
    + '\ntolerance: ' + str(tolerance)\
    + '\niterations: ' + str(len(pts))\
    + '  function evaluations: ' + str(n_evals)

plt.ion()
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.set_proj_type('ortho')
ax.plot_surface(X, Y, Z)
ax.view_init(35, 59)
ax.plot(x, y, z, color='black')
plt.title(title)
plt.show()
In [71]:
# The contour background uses the 2D f still in scope;
# the 10D f and df are redefined below
plot_size = 100
xy_min = -1.5
xy_max = 1.3
x = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
y = np.arange(xy_min, xy_max, (xy_max - xy_min)/plot_size)
(X, Y) = np.meshgrid(x, y)
Z = f([X, Y])

def f(x):
    # D-dimensional generalization of the Rosenbrock function
    global evals, dim
    evals += 1
    total = 0
    for i in range(1, dim):
        total += (1 - x[i-1])**2 + 100*(x[i] - x[i-1]**2)**2
    return total

def df(x):
    # Analytic gradient of the D-dimensional Rosenbrock;
    # count each gradient call as dim function evaluations
    global evals, dim
    evals += dim
    grad = np.zeros(dim)
    grad[0] = -2*(1 - x[0]) - 400*(x[1] - x[0]**2)*x[0]
    grad[dim-1] = 200*(x[dim-1] - x[dim-2]**2)
    for i in range(1, dim-1):
        grad[i] = -2*(1 - x[i]) - 400*(x[i+1] - x[i]**2)*x[i] + 200*(x[i] - x[i-1]**2)
    return grad

dim = 10
start = -1*np.ones(dim)
start_step = 0.1
tolerance = 1e-6
evals = 0  # reset the counter carried over from the 2D search
pts = conjugate_gradient(f, df, start, start_step, tolerance)

point = ''

for i in range(len(start)):
    point += '%.3f '%pts[-1]['pt'][i]
    
title = '10D Rosenbrock conjugate gradient search'\
    + '\nfinal point: ' + point\
    + '\ntolerance: ' + str(tolerance)\
    + '\niterations: ' + str(len(pts))\
    + '  function evaluations: ' + str(evals)

x = []
y = []

for i in range(len(pts)):
    x.append(pts[i]['pt'][0])
    y.append(pts[i]['pt'][1])
    
fig, ax = plt.subplots(1,1)
ax.contourf(X, Y, Z, 1000, vmax=100, cmap=cm.rainbow)
ax.contour(X, Y, Z, 50, linewidths=0.5)
ax.plot(x, y, color='white')
plt.title(title)
plt.show()
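
As a final cross-check (again not part of the original solution), SciPy ships the generalized Rosenbrock function and its gradient, so its nonlinear conjugate gradient routine can be run on the same 10D problem, assuming scipy is available; its line search and stopping rule differ from the ones above:

In [ ]:
# Cross-check with SciPy's nonlinear CG on the same 10D problem
from scipy.optimize import minimize, rosen, rosen_der

res = minimize(rosen, -1*np.ones(10), jac=rosen_der, method='CG', tol=1e-6)
print('final point:', res.x)
print('iterations:', res.nit, ' function evaluations:', res.nfev,
      ' gradient evaluations:', res.njev)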