Levenberg–Marquardt assumes that it is possible to calculate both the first and second derivatives of the function to be minimized, that the starting parameter values are near the desired extremum of the cost function, and that the function is reasonably smooth. We’ll see blobs (for the downhill simplex search), mass (momentum for avoiding local minima), temperature (via simulated annealing), and reproduction (with genetic algorithms).
Equations that arise from numerical methods generally cannot be solved exactly in closed form; they have to be minimized by iterative search.
After each function evaluation there should be some kind of update of the search to make sure that the next function evaluation is as useful as possible.
A simplex is a geometrical figure that has one more vertex than dimension: a triangle in 2D, a tetrahedron in 3D, and so forth.
https://www.youtube.com/watch?v=2eCdWJ59iI4
The first step is to calculate the center of the face of the simplex defined by all of the vertices other than the one we’re trying to improve: take the mean of all x_i excluding the worst point (the one with the largest f). Then reflect the worst vertex through that face, x1_new = 2*x_mean - x1, to move it in the right direction. If that helps, check whether f(3*x_mean - 2*x1) (reflect and grow) is better still; you can keep reflecting to keep moving. However, if reflecting x1 actually makes it worse, then you want to shrink the step: x1_new = (3/2)*x_mean - (1/2)*x1. If f(x1_new) < f(x2) (it's better than the next-worst vertex), accept the move and go on to improving f(x2). If after reflecting and shrinking f(x1_new) is still worse, then try shrinking without reflecting: x1_new = (1/2)*(x_mean + x1). If even that fails, the moves we're making are too big to find the minimum; give up on moving a single vertex and shrink all vertices towards the best one: x_i = (1/2)*(x_i + x_{D+1}). All of these moves are implemented in downhill_simplex below.
There's no built-in stopping criterion; the simplex just keeps tumbling down the hill.
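One reasonable choice, used for part (b) below (a sketch of my own, not from the notes): stop when the best and worst vertices agree to within a fractional tolerance.
# A possible stopping test (sketch): the best and worst simplex vertices
# agree to within a fractional tolerance, so further moves can't help much.
def converged(best_f, worst_f, tol=1e-9):
    return 2*abs(worst_f - best_f) <= tol*(abs(worst_f) + abs(best_f) + tol)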
1a) Plot the Rosenbrock function f = (1-x)^2 + 100(y-x^2)^2
from IPython.display import Image
from IPython.display import FileLink, FileLinks
from matplotlib import pyplot as plt
Image(filename='files/81a.jpg')
The code for the above is here. This function is non-convex and is used as a performance test problem for optimization algorithms.
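For reference, a minimal standalone way to visualize the same function (my sketch; the notebook's own plotting code, plot_data, is below):
import numpy as np
from matplotlib import pyplot as plt

xs = np.linspace(-2, 2, 200)
ys = np.linspace(-1, 3, 200)
X, Y = np.meshgrid(xs, ys)
Z = (1 - X)**2 + 100*(Y - X**2)**2
plt.contour(X, Y, Z, levels=np.logspace(-1, 3, 20))  # log-spaced contours show the banana valley
plt.plot(1, 1, 'r*')   # the global minimum at (1, 1)
plt.xlabel('x'); plt.ylabel('y')
plt.show()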
b) Pick a stopping criterion and use the downhill simplex method to search for its minimum starting from x = y = −1, plotting the search path, and comparing the computing time and memory used.
from mpl_toolkits.mplot3d import axes3d, Axes3D
import numpy as np

def plot_data(n, S, title, simplex_flag):
    S = np.array(S)
    font = {'family': 'monospace',
            'color': '#39393d',
            'weight': 'light',
            'size': 14}
    fig = plt.figure(figsize=(10, 6), facecolor="white")
    ax = fig.add_subplot(111, projection='3d')
    ax.set_title(title, fontdict=font)
    ax.set_xlabel('x', fontdict=font)
    ax.set_ylabel('y', fontdict=font)
    # evaluate the Rosenbrock function on an n-by-n grid over [-g/2, g/2]
    X, Y, Z = [], [], []
    g = 2.0
    for i in range(int(-n/2), int(n/2)):
        X.append(np.ones(n)*(i+1)*(g/n))
        Y.append(np.linspace(-g/2, g/2, n))
    for i in range(len(X)):
        x, y = X[i], Y[i]
        Z.append((1-x)**2 + 100*(y-x**2)**2)
    ax.plot_surface(np.array(X), np.array(Y), np.array(Z), color="red")
    if simplex_flag:
        # draw each simplex in the search history as a triangle on the surface
        for s in S:
            x1, y1 = s[0]; x2, y2 = s[1]; x3, y3 = s[2]
            ax.plot([x1, x2], [y1, y2], [f(x1, y1), f(x2, y2)], color="black")
            ax.plot([x2, x3], [y2, y3], [f(x2, y2), f(x3, y3)], color="black")
            ax.plot([x3, x1], [y3, y1], [f(x3, y3), f(x1, y1)], color="black")
    else:
        # plot the search path as a single curve over the surface
        print("Final point = " + str(S[-1]))
        ax.plot(S[:, 0], S[:, 1], f(S[:, 0], S[:, 1]), color="black")
    ax.view_init(elev=50., azim=60)  # rotate the view shown in the plot
    plt.show()
import numpy as np
def create_simplex(init_point, step):
    # a simplex in D dimensions has D+1 vertices: the starting point
    # plus one point offset by `step` along each coordinate axis
    x = np.zeros((len(init_point)+1, len(init_point)))
    x[0] = init_point
    for i in range(1, len(init_point)+1):
        x[i] = init_point
        x[i, i-1] = init_point[i-1] + step
    # pair each vertex with its function value: [coordinates, f(x, y)]
    new_x = []
    for i in range(len(x)):
        new_x.append([x[i], f(x[i][0], x[i][1])])
    return new_x
def shrink_all(x, f):
    # shrink every vertex towards the best (last-ranked) vertex
    for i in range(len(x)):
        x[i][0] = (1/2)*(x[i][0] + x[len(x)-1][0])
        x[i][1] = f(x[i][0][0], x[i][0][1])
    return x
def rank_vertices(x):
    # sort vertices from worst (highest f) to best (lowest f)
    x.sort(key=lambda v: v[1], reverse=True)
    return x
def downhill_simplex(f, init_point, step, tol=1e-9, max_iter=100):
    x = create_simplex(init_point, step)
    prev_best = f(init_point[0], init_point[1])
    state = []
    for it in range(max_iter):
        x = rank_vertices(x)             # x[0] is the worst vertex, x[-1] the best
        best = x[-1][1]
        second = x[1][1]                 # second-worst vertex
        if abs(prev_best - best) < tol:  # stopping criterion: best value has stalled
            break
        prev_best = best
        # mean of all vertices except the worst
        x_mean = [0.] * 2
        for tup in x[1:]:
            for j, c in enumerate(tup[0]):
                x_mean[j] += c / (len(x)-1)
        # reflect the worst vertex through the opposite face
        p1 = x_mean + 1*(x_mean - x[0][0])
        f_p1 = f(p1[0], p1[1])
        if f_p1 <= best:
            # clearly the best move so far: try reflecting and growing
            p2 = x_mean + 2*(x_mean - x[0][0])
            f_p2 = f(p2[0], p2[1])
            if f_p2 < f_p1:
                x = x[1:] + [[p2, f_p2]]
            else:
                x = x[1:] + [[p1, f_p1]]
        elif f_p1 <= second:
            # better than the second-worst: accept the reflection
            x[0] = x[1]
            x[1] = [p1, f_p1]
        else:
            # reflecting made it worse: reflect and shrink
            p3 = x_mean + (1/2)*(x_mean - x[0][0])
            f_p3 = f(p3[0], p3[1])
            if f_p3 > second:
                # still worse: shrink without reflecting
                p4 = x_mean - (1/2)*(x_mean - x[0][0])
                f_p4 = f(p4[0], p4[1])
                if f_p4 > second:
                    # single-vertex moves are too big: shrink the whole simplex
                    x = shrink_all(x, f)
                else:
                    x[0] = x[1]
                    x[1] = [p4, f_p4]
            else:
                x[0] = x[1]
                x[1] = [p3, f_p3]
        state.append([v[0] for v in x])  # record vertex positions for plotting
    # plot_data(50, state, "Nelder", True)
    return x[-1]                         # [coordinates, value] of the best vertex
init_point = np.array([-1, -1])
def f(x, y):
    # the Rosenbrock function
    return (1-x)**2 + 100*(y - x**2)**2
print(downhill_simplex(f, init_point, 0.5))
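As a cross-check (assuming SciPy is available), the library's Nelder-Mead implementation can be run from the same starting point:
from scipy.optimize import minimize

res = minimize(lambda p: f(p[0], p[1]), np.array([-1., -1.]),
               method='Nelder-Mead', options={'xatol': 1e-9, 'fatol': 1e-9})
print(res.x, res.fun)  # should approach (1, 1), where f = 0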
Image(filename='files/81b.jpg')
c) Repeat with the direction set method.
Image(filename='files/direction-set1.jpg')
Image(filename='files/direction-set2.jpg')
"""
Notes on the direciotn set method above
"""
from math import sqrt
phi = 2/(1 + sqrt(5))  # inverse golden ratio, about 0.618
# a and b are the current bounds; the minimum is between them.
# c is the center point, pushed slightly left towards a
def bracketing(x_init, d):
    # finds three points along direction d with the middle one lowest,
    # so that a minimum is enclosed between the outer two;
    # takes a starting point and a direction vector
    x1 = x_init
    y1 = f(x1[0], x1[1])
    x2 = x1 + d
    y2 = f(x2[0], x2[1])
    # make sure we are stepping downhill: swap if x2 is worse
    if y2 > y1:
        x1, x2 = x2, x1
        y1, y2 = y2, y1
    # keep taking golden-ratio steps until the function turns back up
    x3 = x2 + (x2 - x1) * phi
    y3 = f(x3[0], x3[1])
    while y3 < y2:
        x1, y1 = x2, y2
        x2, y2 = x3, y3
        x3 = x2 + (x2 - x1) * phi
        y3 = f(x3[0], x3[1])
    return (x1, x2, x3)
def line_min(f, x1, x2, x3, x0, ep):
    # golden-section refinement of the bracket (x1, x2, x3);
    # the scalar parameters are mapped back to points via x0
    if abs(x3 - x1) < ep:
        return (x3 + x1)/2
    x2_app = x0 * x2 * phi
    f2 = f(x2_app[0], x2_app[1])
    sub = lambda a, b: pow(a - b, 2)
    s1 = sub(x1, x2)
    s2 = sub(x2, x3)
    # this was really helpful:
    # http://informatik.unibas.ch/fileadmin/Lectures/HS2013/CS253/PowellAndDP1.pdf
    if s1 > s2:  # the larger segment is (x1, x2): probe inside it
        x4 = x1 + phi*(x2 - x1)
        x4_app = x4 * x0
        f4 = f(x4_app[0], x4_app[1])
        if f4 < f2:
            return line_min(f, x1, x4, x2, x0, ep)
        else:
            return line_min(f, x4, x2, x3, x0, ep)
    else:        # the larger segment is (x2, x3): probe inside it
        x4 = x2 + phi*(x3 - x2)
        x4_app = x4 * x0
        f4 = f(x4_app[0], x4_app[1])
        if f4 < f2:
            return line_min(f, x2, x4, x3, x0, ep)
        else:
            return line_min(f, x1, x2, x4, x0, ep)
"""
Not yet working, working on Powell cureently and how it involves racketing.
"""
def powell(x0, f, direction, alpha=1., eps=1e-4, max_iter=100):
    # a single pass of Powell's method: line-minimize along each direction,
    # then replace a direction with the overall displacement
    init_f = f(x0[0], x0[1])
    # minimize along the first direction
    b1, b2, b3 = bracketing(x0, direction[0])
    x1 = (b1 / x0)[0]
    x2 = (b2 / x0)[0]
    x3 = (b3 / x0)[0]
    p2 = line_min(f, x1, x2, x3, x0, 0.1) * x0
    f2 = f(p2[0], p2[1])
    # minimize along the second direction
    b1, b2, b3 = bracketing(p2, direction[1])
    x1 = (b1 / x0)[0]
    x2 = (b2 / x0)[0]
    x3 = (b3 / x0)[0]
    p3 = line_min(f, x1, x2, x3, x0, 0.1) * x0
    f3 = f(p3[0], p3[1])
    # replace the direction that contributed the larger decrease
    # with the total displacement over the pass
    delta_f = p3 - x0
    contri_1 = f2 - init_f
    contri_2 = f3 - f2
    if contri_1 > contri_2:
        direction = [contri_2, delta_f]
    else:
        direction = [contri_1, delta_f]
    return direction, p3
x0 = np.array([-1, -1])
x = np.array([x0])
n_dim = np.shape(x)[-1]
d = np.eye(n_dim)   # initial direction set: the coordinate axes
x = powell(np.array([-1, -1]), f, d)
print("and the point at the end is")
print(x[1][0], x[1][1])
print("and the minimum value is")
print(f(x[1][0], x[1][1]))
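Since the Powell code above is admittedly not working yet, SciPy's Powell implementation (assuming SciPy is available) gives a reference answer:
from scipy.optimize import minimize

res = minimize(lambda p: f(p[0], p[1]), np.array([-1., -1.]), method='Powell')
print(res.x, res.fun)  # should approach (1, 1), where f = 0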
Image(filename='files/con-grad1.jpg')
Image(filename='files/con-grad2.jpg')
# Work above
def get_A(P):
    # Hessian of the Rosenbrock function at point P
    x = P[0]; y = P[1]
    A = [[2 + 1200*x**2 - 400*y, -400*x],
         [-400*x, 200]]
    return A
def b(P):
    # gradient of the Rosenbrock function at point P
    x = P[0]; y = P[1]
    return [-2 + 2*x - 400*y*x + 400*x**3, 200*y - 200*x**2]
def g(A, x, b):
    # residual of the local quadratic model
    return np.dot(A, x) - b
def gamma(A, d, g):
    # Gram-Schmidt conjugation coefficient
    return -np.dot(g, np.dot(A, d))/np.dot(d, np.dot(A, d))
def d_new(g, gamma, d):
    # next search direction: conjugate to the previous one
    return g + np.dot(gamma, d)
# Read more into how this is incorporated with line minimization:
# https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf
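The helpers above are never assembled into a loop in these notes, so the following is a sketch of my own showing how they might combine into a nonlinear conjugate-gradient iteration on the Rosenbrock function; a careful implementation would add a real line minimization and safeguards for where the Hessian is indefinite:
P = np.array([-1., -1.])
r = -np.array(b(P))          # residual = negative gradient
d = r                        # first direction: steepest descent
for _ in range(100):
    A = np.array(get_A(P))   # local Hessian
    # exact minimizing step along d for the local quadratic model
    alpha = np.dot(r, d) / np.dot(d, np.dot(A, d))
    P = P + alpha*d
    r = -np.array(b(P))
    d = d_new(r, gamma(np.array(get_A(P)), d, r), d)
print(P, f(P[0], P[1]))      # track progress towards the minimum at (1, 1)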
You can add "momentum" to the search dynamics so that it coasts through small local minima instead of getting stuck in them; a minimal sketch follows.
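This reuses the gradient b(P) from above; the step size and momentum coefficient are arbitrary choices of mine, not from the notes:
P = np.array([-1., -1.])
v = np.zeros(2)
lr, mu = 1e-4, 0.9                # learning rate and momentum coefficient
for _ in range(20000):
    v = mu*v - lr*np.array(b(P))  # velocity remembers past gradients
    P = P + v                     # momentum carries the search through shallow dips
print(P, f(P[0], P[1]))           # compare with f(-1, -1) = 404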
Evolution works by using a large population to explore many options in parallel, rather than concentrating on trying many changes around a single design. Techniques that do this have come to be called genetic algorithms, or GAs, by analogy with the evolutionary update of the genome. Steps: Fitness - a set of parameters with a low fitness might disappear, and one with a high fitness can be duplicated many times. Crossover - ensembles share parameters. At the opposite extreme is a function with very many equally good local minima; annealing is good there for getting out of the smaller minima into the good ones. GAs are for when you want to explore all the possible options, rather than improving one set of parameters. A toy sketch follows.
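Here is that toy sketch on the same Rosenbrock function (population size, selection scheme, and mutation scale are arbitrary choices of mine):
rng = np.random.default_rng(0)
pop = rng.uniform(-2, 2, size=(50, 2))             # random initial population
for _ in range(200):
    fitness = np.array([f(p[0], p[1]) for p in pop])
    best = pop[np.argsort(fitness)[:10]]           # fitness: keep the 10 best, the rest disappear
    parents = best[rng.integers(0, 10, size=(50, 2))]
    pop = np.where(rng.random((50, 2)) < 0.5,      # crossover: each parameter from one of two parents
                   parents[:, 0], parents[:, 1])
    pop = pop + rng.normal(0, 0.05, size=(50, 2))  # mutation
i = np.argmin([f(p[0], p[1]) for p in pop])
print(pop[i], f(pop[i][0], pop[i][1]))             # best individual found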
It's hard to create an algorithm that can search a space given only a small view of the landscape. If a model is not going to have a small number of meaningful parameters, then the best thing to do is to give it so many adjustable parameters that there’s no trouble finding a good solution, and to prevent overfitting by imposing priors.