from __future__ import division
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D, proj3d
from matplotlib.colors import LogNorm
from matplotlib import rc
import matplotlib.animation as animation
import seaborn as sns

rc('animation', html='html5')
sns.set(style='whitegrid', rc={'figure.figsize': (10, 8)})
The general formula for the function is
$f(x, y) = (a - x)^2 + b(y - x^2)^2,$
which has a global minimum at $(a, a^2)$.
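To see why, set both partial derivatives to zero:
$\frac{\partial f}{\partial x} = -2(a - x) - 4bx(y - x^2) = 0, \qquad \frac{\partial f}{\partial y} = 2b(y - x^2) = 0.$
The second equation forces $y = x^2$, and substituting into the first gives $x = a$, so the unique critical point is $(a, a^2)$ with $f(a, a^2) = 0$.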
The function is nonconvex, and it has many points that are close to being a minimum, which makes it a good stress test for optimization routines. I graph the function below:
Neil has given us:
$f(x, y) = (1 - x)^2 + 100(y - x^2)^2$
and thus I expect a minimum at $(1,1)$.
# Define the Rosenbrock function
def rosenbrock(x, a=1, b=100):
    """Rosenbrock function of x = (x[0], x[1]) with minimum of 0 at (a, a**2)."""
    y = x[1]
    x = x[0]
    return (a - x)**2 + b*(y - x**2)**2
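As a quick sanity check, the function should be exactly zero at the minimum; a minimal snippet using the rosenbrock definition above:
print(rosenbrock([1.0, 1.0]))   # 0.0 at the minimum (1, 1)
print(rosenbrock([0.0, 0.0]))   # 1.0, since (1 - 0)^2 + 100*(0 - 0)^2 = 1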
x = np.linspace(-2, 2, 500)
y = np.linspace(-2, 2, 500)
X, Y = np.meshgrid(x, y)
Z = rosenbrock([X, Y])
fig = plt.figure(figsize=(16, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122, projection="3d")
ax1.set_xlim([-2, 2])
ax1.set_ylim([-2, 2])
ax1.set_xlabel("x")
ax1.set_ylabel("y")
ax2.set_xlabel("x")
ax2.set_ylabel("y")
ax2.set_zlim([0, 2500])
# Color mesh
ax1.set_facecolor("white")
ax1.pcolormesh(X, Y, Z, cmap=matplotlib.cm.jet, norm=LogNorm())
ax1.scatter(1, 1, color="k")
ax1.annotate('Global Min', xy=(1, 1), xytext=(-0.5, 1.25),
             arrowprops=dict(facecolor='black', shrink=0.05))
# Surface plot
ax2.set_facecolor("white")
ax2.plot_surface(X, Y, Z, norm=LogNorm(), cmap=matplotlib.cm.jet, linewidth=0)
ax2.view_init(azim=65, elev=25)
ax2.scatter(1., 1., 0., color="k")
# Project the 3D minimum into 2D display coordinates so we can annotate it
xa, ya, _ = proj3d.proj_transform(1, 1, 0, ax2.get_proj())
ax2.annotate("Global Min", xy=(xa, ya), xytext=(-20, 30), textcoords='offset points',
             ha='right', va='bottom', arrowprops=dict(facecolor='black', shrink=0.05))
plt.tight_layout()
plt.show()
Here I discuss the downhill simplex (or Nelder-Mead) algorithm. The main idea is simple: maintain a simplex of $n+1$ points, and at each step replace the worst vertex with a better one by reflecting, expanding, or contracting it through the centroid of the remaining vertices, shrinking the whole simplex when nothing else helps.
Following Neil's book and this paper (Nice Nelder-Mead Paper!), I set up the algorithm's operations as the following steps (a short code sketch of the operations appears after the list):
1. There are two ways to obtain the initial simplex. The first is to simply pass in a simplex $\Delta$ as the guess. The second is to pass in a single point, $x_0$, and build the simplex around it (we use $x_0$ as one vertex and $x_i := x_0 + \varepsilon e_i$ for the $n$ other vertices).
2. Once we have the simplex, evaluate the function at each vertex and sort the vertices so that $f(x_1) \leq f(x_2) \leq \dots \leq f(x_{n+1})$, then compute the centroid of the best $n$ vertices, $\bar{x} := \frac{1}{n} \sum_{i=1}^n x_i$.
3. If $|f(\bar{x}) - f(x_1)|$ (or another convergence metric of your choosing) is below the tolerance, return $x_1$ as the minimum; otherwise proceed.
4. Create the reflected point $x_r := \bar{x} + \alpha(\bar{x} - x_{n+1})$.
4.1 If $f(x_1) \leq f(x_r) < f(x_n)$, replace $x_{n+1}$ with $x_r$.
4.2 Return to step 2.
5. If $f(x_r) < f(x_1)$, create the expanded point $x_e := \bar{x} + \beta(x_r - \bar{x})$.
5.1 If $f(x_e) < f(x_r)$, replace $x_{n+1}$ with $x_e$.
5.2 Otherwise, replace $x_{n+1}$ with $x_r$.
5.3 Return to step 2.
6. If $f(x_n) \leq f(x_r) \leq f(x_{n+1})$, create the outside contraction point $x_{oc} := \bar{x} + \gamma(x_r - \bar{x})$.
6.1 If $f(x_{oc}) \leq f(x_r)$, replace $x_{n+1}$ with $x_{oc}$.
6.2 Otherwise, shrink the points towards $x_1$ by setting $x_i := x_1 + \delta(x_i - x_1)$ for $i = 2, \dots, n+1$.
6.3 Return to step 2.
7. If $f(x_r) > f(x_{n+1})$, create the inside contraction point $x_{ic} := \bar{x} - \gamma(x_r - \bar{x})$.
7.1 If $f(x_{ic}) \leq f(x_r)$, replace $x_{n+1}$ with $x_{ic}$.
7.2 Otherwise, shrink the points towards $x_1$ as in step 6.2.
7.3 Return to step 2.
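Here is a minimal sketch of these operations in code; the helper names are my own, and the parameter defaults shown are the standard NMS values ($\alpha = 1$, $\beta = 2$, $\gamma = 1/2$, $\delta = 1/2$):
# A minimal sketch of the simplex operations; xbar is the centroid of the
# best n vertices and xworst is the current worst vertex
def reflect(xbar, xworst, alpha=1.):
    return xbar + alpha*(xbar - xworst)    # step 4: x_r

def expand(xbar, xr, beta=2.):
    return xbar + beta*(xr - xbar)         # step 5: x_e

def outside_contract(xbar, xr, gamma=.5):
    return xbar + gamma*(xr - xbar)        # step 6: x_oc

def inside_contract(xbar, xr, gamma=.5):
    return xbar - gamma*(xr - xbar)        # step 7: x_ic

def shrink(vertices, delta=.5):
    # steps 6.2/7.2: pull every vertex toward the best vertex, vertices[0]
    return [vertices[0] + delta*(v - vertices[0]) for v in vertices]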
That is the entire algorithm: as previously stated, it simply applies our main operations repeatedly until we converge. I have written a simple implementation below.
def nmd(f, x0, method="ANMS", tol=1e-8, maxit=1e4, iter_returns=None):
"""
Parameters
----------
f : callable
Function to minimize
x0 : scalar(float) or array_like(float, ndim=1)
The initial guess for minimizing
method : string or tuple(floats)
If a string, should specify ANMS or NMS then will use specific
parameter values, but also can pass in a tuple of parameters in
order (alpha, beta, gamma, delta), which are the reflection,
expansion, contraction, and contraction parameters
tol : scalar(float)
The tolerance level to achieve convergence
    maxit : scalar(int)
        The maximum number of iterations allowed
    iter_returns : list(int) or None
        If not None, the iterations at which the current simplex should be
        stored for later inspection
    """
#-----------------------------------------------------------------#
# Set some parameter values
#-----------------------------------------------------------------#
init_guess = x0
fx0 = f(x0)
dist = 10.
curr_it = 0
# Get the number of dimensions we are optimizing
n = np.size(x0)
    # Use the Adaptive Nelder-Mead Simplex parameters by default; note that
    # for n = 2 these coincide with the standard NMS values
    if method == "ANMS":
        alpha = 1.
        beta = 1. + (2./n)
        gamma = .75 - 1./(2.*n)
        delta = 1. - (1./n)
    # Otherwise can use standard parameters
    elif method == "NMS":
        alpha = 1.
        beta = 2.
        gamma = .5
        delta = .5
    elif isinstance(method, tuple):
        alpha, beta, gamma, delta = method
    else:
        raise ValueError("method must be 'ANMS', 'NMS', or a 4-tuple of parameters")
#-----------------------------------------------------------------#
# Create the simplex points and do the initial sort
#-----------------------------------------------------------------#
    simplex_points = np.empty((n+1, n))
    pt_fval = [(x0, fx0)]
    simplex_points[0, :] = x0
    for ind, elem in enumerate(x0):
        # Perturb the ind-th coordinate to create the remaining n vertices
        if np.abs(elem) < 1e-14:
            curr_tau = 0.00025
        else:
            curr_tau = 0.05
        curr_point = np.squeeze(np.eye(1, M=n, k=ind)*curr_tau + x0)
        simplex_points[ind+1, :] = curr_point
        pt_fval.append((curr_point, f(curr_point)))
if iter_returns is not None:
ret_points = []
else:
ret_points = None
#-----------------------------------------------------------------#
# The Core of The Nelder-Mead Algorithm
#-----------------------------------------------------------------#
    while dist > tol and curr_it < maxit:
        # 1: Sort and find new center point (excluding worst point)
        pt_fval = sorted(pt_fval, key=lambda v: v[1])
        xbar = np.zeros(n)
        for i in range(n):
            xbar = xbar + pt_fval[i][0]/n
if iter_returns is not None and curr_it in iter_returns:
ret_points.append(pt_fval)
# Define useful variables
x1, f1 = pt_fval[0]
xn, fn = pt_fval[n-1]
xnp1, fnp1 = pt_fval[n]
# 2: Reflect
xr = xbar + alpha*(xbar - pt_fval[-1][0])
fr = f(xr)
if f1 <= fr < fn:
# Replace the n+1 point
xnp1, fnp1 = (xr, fr)
pt_fval[n] = (xnp1, fnp1)
elif fr < f1:
# 3: expand
xe = xbar + beta*(xr - xbar)
fe = f(xe)
if fe < fr:
xnp1, fnp1 = (xe, fe)
pt_fval[n] = (xnp1, fnp1)
else:
xnp1, fnp1 = (xr, fr)
pt_fval[n] = (xnp1, fnp1)
elif fn <= fr <= fnp1:
# 4: outside contraction
xoc = xbar + gamma*(xr - xbar)
foc = f(xoc)
if foc <= fr:
xnp1, fnp1 = (xoc, foc)
pt_fval[n] = (xnp1, fnp1)
else:
# 6: Shrink
for i in range(1, n+1):
curr_pt, curr_f = pt_fval[i]
# Shrink the points
new_pt = x1 + delta*(curr_pt - x1)
new_f = f(new_pt)
# Replace
pt_fval[i] = new_pt, new_f
elif fr >= fnp1:
# 5: inside contraction
xic = xbar - gamma*(xr - xbar)
fic = f(xic)
if fic <= fr:
xnp1, fnp1 = (xic, fic)
pt_fval[n] = (xnp1, fnp1)
else:
# 6: Shrink
for i in range(1, n+1):
curr_pt, curr_f = pt_fval[i]
# Shrink the points
new_pt = x1 + delta*(curr_pt - x1)
new_f = f(new_pt)
# Replace
pt_fval[i] = new_pt, new_f
# Compute the distance and increase iteration counter
dist = abs(fn - f1)
curr_it = curr_it + 1
if curr_it == maxit:
raise ValueError("Max iterations; Convergence failed.")
    if iter_returns is not None:
        return x1, f1, curr_it, ret_points
else:
return x1, f1, curr_it
iterstosee = [0, 1, 2, 3, 4, 5, 10, 20, 50, 75, 90, 95]
x, fx, its, ret_tris = nmd(rosenbrock, x0=np.array([-1, -1.]), tol=1e-12, iter_returns=iterstosee)
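As an optional sanity check (assuming SciPy is available), we can compare against SciPy's own Nelder-Mead implementation:
from scipy.optimize import minimize

# SciPy's reference Nelder-Mead should land on (1, 1) as well
res = minimize(rosenbrock, np.array([-1., -1.]), method="Nelder-Mead")
print(res.x)   # should be close to [1., 1.]
print(x, fx)   # the result from our implementation above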
fig, axs = plt.subplots(nrows=6, ncols=2, figsize=(16, 24))
axs = axs.flatten()
# Color mesh with the current simplex overlaid at selected iterations
for i, curr_ax in enumerate(axs):
    curr_simplex = np.vstack([pt for pt, fval in ret_tris[i]])
    curr_ax.pcolormesh(X, Y, Z, cmap=matplotlib.cm.jet, norm=LogNorm())
    curr_ax.set_title("Simplex at iteration %i" % iterstosee[i])
    curr_ax.scatter(curr_simplex[:, 0], curr_simplex[:, 1])
plt.tight_layout()
plt.show()
print("Minimum found at x =", x)
print("Function value f(x) = %.2e" % fx)
print("Converged in %i iterations" % its)