Search¶

In [45]:
import numpy as np
import matplotlib.pyplot as plt
In [46]:
%config InlineBackend.figure_format = 'png'

part 1¶

plot rosenbrock "banana" function

In [103]:
def f(x,y):
    return (1-x)**2 + 100*(y-x**2)**2
In [117]:
x = np.linspace(-2.5,2.5,100)
y = np.linspace(-2.5,2.5,100)
X,Y = np.meshgrid(x,y)
banana = f(X,Y)
# print(banana.shape)
# print(banana)
In [67]:
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))
ax.plot_surface(X,Y,banana,cmap='plasma')
plt.title("banana 2d")
plt.show()

part 2¶

simplex

In [119]:
# hmm. let's make a generic banana func... maybe not yet
def banana(x):
    dim = x.shape[1]
    length = x.shape[0]
#     print(dim)
#     print(length)
#     print(dim)
    f = np.zeros(length)
    for d in range(1,dim):
        f += ((1-x[:,d-1])**2 + 100*(x[:,d]-x[:,d-1]**2)**2)
    return f
In [122]:
# let's check it still works for dim 2
x = np.linspace(-2.5,2.5,100)
y = np.linspace(-2.5,2.5,100)
X,Y = np.meshgrid(x,y)
# print(X.shape)
stack = np.column_stack((X.flatten(),Y.flatten()))
# print(stack)
# print(stack.shape)
ban = banana(stack).reshape(x.shape[0],y.shape[0])
# print(ban)
# X,Y = np.meshgrid(x,y)
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))
ax.plot_surface(X,Y,ban,cmap='plasma')
plt.title("banana 2d - generic dimenension function")
plt.show()
In [325]:
def simplex(init,f,stop_cond=0.0001,max_iter=500):
    """
    init: starting point
    f: function to pass in
    stop_cond: stopping condition, stdev between simplex points
    max_iter: maximum iterations before breaking
    """
    dim = len(init) # what dimension are we in
    init_spacing = 0.25 # how to generate initial spacing
    # construct initial simplex:
    simplex = np.zeros((dim+1,dim)) # array of nodes -> dim+1 points in dim space
    simplex[0] = init # our first point
    # generate the next points: we just move out in each dimension by init spacing
    for d in range(dim):
        simplex[d+1] = np.array(init)
        simplex[d+1][d] += init_spacing
    
    # we add another column of function evaluations
    f_simplex = f(simplex)
    simplex = np.column_stack((simplex,f_simplex))
    
    stdev = np.sum(np.array(np.std(simplex,axis=0))) # the stopping condition
    i = 0
    
    # for plotting
    simplex_list = [simplex]
    
    while (stdev >=stop_cond) and (i < max_iter):
        # the main loop:
        # sort simplex according to f values
        sort_simplex = simplex[simplex[:, dim].argsort()[::-1]]

        x_mean = np.mean(sort_simplex[1:], axis=0)

        # Find reflection and evaluate
        x1_new = 2*x_mean - sort_simplex[0]
        x1_new[dim] = f(x1_new[:dim].reshape(1,dim))
        
        # if x1_new is the best, try reflecting double the distance
        if x1_new[dim] < sort_simplex[-1,dim]:
            x1_new_new = 3*x_mean - 2*sort_simplex[0]
            x1_new_new[dim] = f(x1_new_new[:dim].reshape(1,dim))
            if x1_new_new[dim] < x1_new[dim]:
                # if this is better, then this is our new simplex point
                sort_simplex[0] = x1_new_new
            else:
                # otherwise we use just the single level reflection
                sort_simplex[0] = x1_new
                #print('b')
        elif x1_new[dim] > sort_simplex[0,dim]:
            # if it gets worse
            # we try shrinking
            x1_new_new = 3/2*x_mean - 1/2*sort_simplex[0]
            x1_new_new[dim] = f(x1_new_new[:dim].reshape(1,dim))

            if x1_new_new[dim] < sort_simplex[0,dim]:
                # we accept this move
                sort_simplex[0] = x1_new_new
            else:
                # try shrinking again
                x1_new_new = 1/2*(x_mean-sort_simplex[0])
                x1_new_new[dim] = f(x1_new_new[:dim].reshape(1,dim))
                if x1_new_new[dim] <= sort_simplex[dim-1,dim]:
                    # we accept this move
                    sort_simplex[0] = x1_new_new
                else:
                    # shrink all vertices toward best one.
                    for d in range(dim):
                        sort_simplex[d] = 1/2*(sort_simplex[d] + sort_simplex[-1])
        else:
            # if it is neither best nor worst, we replace the worst with it and keep going.
            sort_simplex[0] = x1_new
        
        # Update our new simplex with the sorted version
        simplex = sort_simplex
        stdev = np.sum(np.array(np.std(simplex,axis=0)))
        i+=1 # step iterations forward
        #print(i)
        simplex_list.append(simplex)
    return simplex_list, stdev, i

#simplex([-1,-1],banana)
In [326]:
simplexes, stdev, iters = simplex([-1,-1],banana)
print(f"iterations = {iters}")
print(f"standard deviation = {stdev}")
print(f"final simplex nodes = {simplexes[-1]}")
iterations = 80
standard deviation = 5.786516701697217e-05
final simplex nodes = [[1.00011637e+00 1.00024133e+00 5.06170031e-08]
 [1.00016185e+00 1.00033461e+00 4.84447327e-08]
 [1.00014584e+00 1.00030609e+00 4.19529621e-08]]
In [328]:
# generate the function
x = np.linspace(-1.5,1.5,100)
y = np.linspace(-1.5,1.5,100)
X,Y = np.meshgrid(x,y)
stack = np.column_stack((X.flatten(),Y.flatten()))
ban = banana(stack).reshape(x.shape[0],y.shape[0])

# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final simplex
final_sim = simplexes[-1]
ax.scatter(final_sim[:,0],final_sim[:,1],final_sim[:,2], c='r')

# preceeding simplexes
simplex_array = np.array(simplexes) # convert to array
# we duplicate the last point to close our triangles
simplex_plot = np.zeros((simplex_array.shape[0],4,3))
simplex_plot[:,:3,:] = simplex_array[:,:,:]
simplex_plot[:,3,:] = simplex_array[:,0,:]
# ok, now we're going to plot. 
for simp in simplex_plot:
    ax.plot(simp[:,0],simp[:,1],simp[:,2],'k-')

# simplex_array = np.column_stack((simplex_array,simplex_array[:,0]))

plt.title("banana 2d - downhill simplex")
plt.show()
In [330]:
import matplotlib.animation as animation
In [ ]:
# don't bother running the animation in the notebook... womp womp
In [340]:
# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final simplex
final_sim = simplexes[-1]
ax.scatter(final_sim[:,0],final_sim[:,1],final_sim[:,2], c='r')

# preceeding simplexes
simplex_array = np.array(simplexes) # convert to array
# we duplicate the last point to close our triangles
simplex_plot = np.zeros((simplex_array.shape[0],4,3))
simplex_plot[:,:3,:] = simplex_array[:,:,:]
simplex_plot[:,3,:] = simplex_array[:,0,:]
# ok, now we're going to plot. 
# for simp in simplex_plot:
#     ax.plot(simp[:,0],simp[:,1],simp[:,2],'k-')

def update_simplex(i):
    ax.plot(simplex_plot[i,:,0],simplex_plot[i,:,1],simplex_plot[i,:,2],'k-')

# simplex_array = np.column_stack((simplex_array,simplex_array[:,0]))

# plt.title("banana 2d - downhill simplex")
# plt.show()

ani = animation.FuncAnimation(fig, update_simplex, range(iters), blit=False, interval=200, repeat=True)
plt.show()

part 3¶

10-D

In [348]:
init_x = -1*np.ones(10)
simplexes, stdev, iters = simplex(init_x,banana,max_iter=5000)
print(f"iterations = {iters}")
print(f"standard deviation = {stdev}")
print(f"final node vals = {simplexes[-1][-1]}")
iterations = 2438
standard deviation = 6.741947394245437e-05
final node vals = [9.99997972e-01 9.99990658e-01 9.99995978e-01 9.99994117e-01
 9.99983279e-01 9.99981523e-01 9.99973239e-01 9.99955118e-01
 9.99903423e-01 9.99808785e-01 8.53031775e-08]

so these are also converging towards all 1s? and one zero is from mystery extra dimension? i don't know how to visualize this. ¯_(ツ)_/¯

part 4¶

gradient descent

In [ ]:
# Let's copy our banana back again
def banana(x):
    dim = x.shape[1]
    length = x.shape[0]
    f = np.zeros(length)
    for d in range(1,dim):
        f += ((1-x[:,d-1])**2 + 100*(x[:,d]-x[:,d-1]**2)**2)
    return f
In [812]:
# Our gradient
def banana_grad(x):
    dim = x.shape[1]
    length = x.shape[0]
    f = np.zeros((length,dim))
    for i in range(1,dim-1):
        f[:,i] = (-2*(1-x[:,i]) - 400*x[:,i]*(x[:,i+1]-x[:,i])**2 + 200*(x[:,i]-x[:,i-1]**2))
    #f[:,0] = 400*x[:,0]*(x[:,1]-x[:,0])**2 - 2*(1-x[:,0])
    f[:,0] = - 2*(1-x[:,0]) - 400*x[:,0]*(x[:,1]-x[:,0]**2)
    f[:,-1] = 200*(x[:,-1]-x[:,-2]**2)
    return f
In [431]:
# check it sounds reasonable ish?
x = np.linspace(-2.5,2.5,100)
y = np.linspace(-2.5,2.5,100)
X,Y = np.meshgrid(x,y)
stack = np.column_stack((X.flatten(),Y.flatten()))
ban = banana(stack).reshape(x.shape[0],y.shape[0])
# ban_grad = banana_grad(stack)
In [828]:
def grad_descent(init_x,f,df,gamma=0.002,min_err=0.0001,max_iters=1000):
    # we start at init_x
    pos = init_x
    # first step
    pos_next = pos - gamma*df(pos)
    # rate of change will be difference between them
    err = abs(f(pos)-f(pos_next))
    # iterations
    pos_list = [pos]
    i = 0
    while (i<=max_iters) and (err> min_err):
        pos = pos_next
        pos_list.append(pos)
        pos_next = pos - gamma*df(pos)
        err = abs(f(pos)-f(pos_next))
        i+=1
    return pos, i, err,pos_list
init_x = np.array([[-1,-1]])
pos, i, err,pos_list = grad_descent(init_x,banana,banana_grad,min_err=0.00001)
In [829]:
print(pos)
print(i)
# generate the function
x = np.linspace(-1.5,1.5,100)
y = np.linspace(-1.5,1.5,100)
X,Y = np.meshgrid(x,y)
stack = np.column_stack((X.flatten(),Y.flatten()))
ban = banana(stack).reshape(x.shape[0],y.shape[0])

# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final point
final_pt = pos
ax.scatter(final_pt[:,0],final_pt[:,1],banana(final_pt), c='r')

# the path there
for pt in pos_list:
    ax.plot(pt[0][0],pt[0][1],banana(pt)[0],c='k',ls = '-',linewidth=10,marker='x')

plt.title("banana 2d - gradient descent, gamma=0.002")
plt.show()
[[0.83974915 0.70447367]]
1001
In [817]:
def banana_grad_2d(x):
    dim = x.shape[1]
    assert dim ==2, f"dimension must be 2d, not {dim}"
    length = x.shape[0]
    f = np.zeros((length,dim))
    f[:,0] = - 2*(1-x[:,0]) - 400*x[:,0]*(x[:,1]-x[:,0]**2)
    f[:,1] = 200*(x[:,1]-x[:,0]**2)
    return f
In [827]:
def grad_descent_momentum1(init_x,f,df,alpha = 0.05,gamma=0.002,min_err=0.0001,max_iters=1000):
    # we start at init_x
    pos = init_x
    # first step
    pos_next = pos - gamma*df(pos)
    grad_prev = df(pos)
    # err will be difference between z heights
    err = abs(f(pos)-f(pos_next))
    # iterations
    pos_list = [pos]
    i = 0
    while (i<=max_iters) and (err> min_err):
        pos = pos_next
        pos_list.append(pos)
        pos_next = pos - gamma*df(pos) - alpha*grad_prev*gamma
        grad_prev = df(pos_next)
        err = abs(f(pos)-f(pos_next))
        i+=1
    return pos, i, err,pos_list
init_x = np.array([[-1,-1]])
pos, i, err,pos_list = grad_descent_momentum1(init_x,banana,banana_grad_2d,alpha=0.35,gamma = 0.002,min_err=0.00001,max_iters=1000)
print(pos)
print(i)
print(err)

# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final point
final_pt = pos
ax.scatter(final_pt[:,0],final_pt[:,1],banana(final_pt), c='r')

# the path there
for pt in pos_list:
    ax.plot(pt[0][0],pt[0][1],banana(pt)[0],c='k',ls = '-',linewidth=10,marker='x')

plt.title("banana 2d - gradient descent, with momentum")
plt.show()
[[0.87635505 0.7682998 ]]
737
[8.95607747e-06]
In [837]:
def grad_descent_momentum(init_x,f,df,alpha = 0.05,gamma=0.002,min_err=0.0001,max_iters=1000):
    # we start at init_x
    pos = init_x
    # first step
    pos_next = pos - gamma*df(pos)
    # rate of change will be difference between them
    err = abs(f(pos)-f(pos_next))
    # iterations
    pos_list = [pos]
    i = 0
    while (i<=max_iters) and (err> min_err):
        pos = pos_next
        pos_list.append(pos)
        pos_next = pos - gamma*df(pos)
        pos_next += alpha*(np.random.rand(pos.shape[0],pos.shape[1])-0.5)#alpha*gamma*(df(pos_next)-df(pos))
        # randomly do something bad
        bad = np.random.rand()
        if bad< 0.85:
            if abs(np.sum(df(pos_next)-df(pos)))<1.75:
                pos_next+=alpha*(np.random.rand()-0.5)
            #print('h')
        if bad > 0.98:
            pos_next += 2*(np.random.rand()-0.5)
            #print('bad')
        err = abs(f(pos)-f(pos_next))
        i+=1
    return pos, i, err,pos_list
init_x = np.array([[-1,-1]])
pos, i, err,pos_list = grad_descent_momentum(init_x,banana,banana_grad_2d,alpha=0.07)
print(pos)
print(i)
print(err)

# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final point
final_pt = pos
ax.scatter(final_pt[:,0],final_pt[:,1],banana(final_pt), c='r')

# the path there
for pt in pos_list:
    ax.plot(pt[0][0],pt[0][1],banana(pt)[0],c='k',ls = '-',linewidth=10,marker='x')

plt.title("banana 2d - gradient descent, with massive random jitters")
plt.show()
[[0.82356317 0.93651589]]
1001
[1.08819806]
In [846]:
def grad_descent_approx(init_x,f,gamma=0.02,min_err=0.00001,max_iters=5000):
    # we start at init_x
    pos = init_x
    # our fake gradient
    def df(pos):
        w = 0.35*np.random.normal(0, (0.25)**0.5, 2)
        grad = (f(pos+w) - f(pos))*w
        return grad
    # first step
    pos_next = pos - gamma*df(pos)
    # rate of change will be difference between them
    err = abs(f(pos)-f(pos_next))
    # iterations
    pos_list = [pos]
    i = 0
    while (i<=max_iters) and (err> min_err):
        pos = pos_next
        pos_list.append(pos)
        pos_next = pos - gamma*df(pos)
        while (abs(pos_next[0,0])>=1.5) or (abs(pos_next[0,1])>=1.5):
            pos_next = pos- gamma*df(pos)
        err = abs(f(pos)-f(pos_next))
        i+=1
    return pos, i, err,pos_list
init_x = np.array([[-1,-1]])
pos, i, err,pos_list = grad_descent_approx(init_x,banana)
print(pos)
print(i)
print(err)

# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final point
final_pt = pos
ax.scatter(final_pt[:,0],final_pt[:,1],banana(final_pt), c='r')

# the path there
for pt in pos_list:
    ax.plot(pt[0][0],pt[0][1],banana(pt)[0],c='k',ls = '-',linewidth=10,marker='x')

plt.title("banana 2d - fake gradient descent")
plt.show()
[[0.23793656 0.03814389]]
104
[1.00855447e-06]
In [783]:
def grad_descent_baseline(init_x,f,baseline=0.0,gamma=0.02,min_err=0.0001,max_iters=1000):
    # we start at init_x
    pos = init_x
    # our fake gradient
    def df(pos):
        w = 0.1*np.random.normal(0, (0.25)**0.5, 2)
        grad = (f(pos+w) - baseline)*w
        return grad
    # first step
    pos_next = pos - gamma*df(pos)
    # rate of change will be difference between them
    err = abs(f(pos)-f(pos_next))
    # iterations
    pos_list = [pos]
    i = 0
    while (i<=max_iters) and (err> min_err):
        pos = pos_next
        pos_list.append(pos)
        pos_next = pos - gamma*df(pos)
        while (abs(pos_next[0,0])>=1.5) or (abs(pos_next[0,1])>=1.5):
            pos_next = pos- gamma*df(pos)
        err = abs(f(pos)-f(pos_next))
        i+=1
    return pos, i, err,pos_list
init_x = np.array([[-1,-1]])
pos, i, err,pos_list = grad_descent_baseline(init_x,banana,baseline=2)
print(pos)
print(i)
print(err)

# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final point
final_pt = pos
ax.scatter(final_pt[:,0],final_pt[:,1],banana(final_pt), c='r')

# the path there
for pt in pos_list:
    ax.plot(pt[0][0],pt[0][1],banana(pt)[0],c='k',ls = '-',linewidth=10,marker='x')

plt.title("banana 2d - faker gradient descent")
plt.show()
[[ 0.01110981 -0.0931228 ]]
238
[4.14559127e-05]

10d gradient descent¶

In [848]:
init_x = -1*np.ones((1,10))
pos, i, err,pos_list = grad_descent(init_x,banana,banana_grad)
print(pos)
print(i)
print(err)
# ok ish 
[[0.9958193  0.99168183 0.98368305 0.96867959 0.94183782 0.89525658
  0.82368141 0.70417671 0.5662131  0.31220521]]
245
[9.36227659e-05]

Part 5¶

conjugate gradient

In [880]:
def conjugate_gradient(init_x,f,df, rate = 0.001, min_err=0.001, max_iters = 1000):
    pos = init_x
    d_i = df(pos) # initial direction will be the gradient
    g_i = df(pos)
    # next step
    pos_next = pos - rate*d_i
    #error
    err = abs(f(pos)-f(pos_next))
    # iterations
    pos_list = [pos]
    i = 0
    while (i<=max_iters) and (err> min_err):
        # advance our current position
        pos = pos_next
        pos_list.append(pos)
        # Find next gradient
        g_i_next = df(pos) # since we've updated already
        # find gamma
        #gamma_i = (g_i_next * (g_i_next-g_i))/(g_i*g_i)
        gamma_i = (np.dot(g_i_next, (g_i_next-g_i).T))/(g_i@g_i.T)
        # find next direction
        d_i = g_i_next + gamma_i*d_i
        # make next step
        pos_next = pos - rate*d_i
        #update gradients
        g_i = g_i_next
        # update errors and iteration
        err = abs(f(pos)-f(pos_next))
        i+=1
    return pos, i, err,pos_list

init_x = np.array([[-1,-1]])
pos, i, err,pos_list = conjugate_gradient(init_x, banana, banana_grad_2d,rate = 0.001, min_err=0.0001, max_iters = 5000)
print(pos)
print(i)
# plot in 3D
fig,ax = plt.subplots(subplot_kw=dict(projection='3d'))

# banana function
ax.plot_surface(X,Y,ban,cmap='plasma',alpha=0.3)

# final point
final_pt = pos
ax.scatter(final_pt[:,0],final_pt[:,1],banana(final_pt), c='r')

# the path there
for pt in pos_list:
    ax.plot(pt[0][0],pt[0][1],banana(pt)[0],c='k',ls = '-',linewidth=10,marker='x')

plt.title("banana 2d - conjugate gradient")
plt.show()
[[0.72212234 0.52015854]]
1282
In [885]:
x = np.linspace(-1,1,500)
y = np.linspace(-1,1,500)
X,Y = np.meshgrid(x,y)
stack = np.column_stack((X.flatten(),Y.flatten()))
ban = banana(stack).reshape(x.shape[0],y.shape[0])
fig,ax = plt.subplots()
# for pt in pos_list:
#     ax.plot(pt[0,0],pt[0,1],c='k',ls = '-',linewidth=10,marker='.')
pos_array = np.array(pos_list).reshape(1283,2)
ax.contourf(X,Y,ban,100,cmap='plasma',alpha=0.85)
ax.plot(pos_array[:,0],pos_array[:,1],'k-')
plt.title("contour version - conjugate gradient descent")
plt.show()

part ??¶

CMAES

In [875]:
def CMAES(trawlers,f,B,rate = 0.001, max_iters=100):
    trawlers_next = np.zeros_like(trawlers)
    time = len(B)
    N = trawlers.shape[0] # how many of them
    dim = trawlers.shape[1]
    for t in range(time):
        temperature = B[t]
        next_move = rate*(np.random.rand(N,dim)-0.5)
        next_move[np.nonzero((next_move.any()>0) and (np.random.rand()>temperature))] = 0 # reject bad moves
        trawlers_next = trawlers + next_move
        trawlers = trawlers_next
    min_point = np.amin(f(trawlers),axis=0)
    return min_point
trawlers = np.random.rand(10,2)-0.5 # starting positions (2D)
B = np.zeros(100) # temperature
pos = CMAES(trawlers,banana,B)
print(pos)
1.0759811512047548