%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
def custom_svd(mat, **options):
    # np.linalg.svd returns U, the singular values s, and V^T (called v here)
    u, s, v = np.linalg.svd(mat, **options)
    # Build the pseudo-inverse of the singular-value matrix, W^-1
    w_i = np.zeros((len(v), len(u)))
    w_i[:len(s), :len(s)] = np.diag(1/s)
    return u, w_i, v, s
# Generating 100 randomly distributed points and building the dataset
x = np.random.uniform(low=0, high=1, size=100)
y = 2 + 3*x + np.random.normal(loc=0.0, scale=.5, size=100)
# Using SVD to solve y = a + bx
A = np.column_stack((np.ones_like(x),x))
B = y
U, Winv, V, s = custom_svd(A, full_matrices=True)
params = np.matmul(V.T@Winv@U.T, B)
# Calculate error with equation (12.34): var(a_j) = sum_i (V_ji / w_i)^2,
# scaled by the measurement variance (noise std = 0.5); note V here holds V^T
erra = np.sum((V / s[:, None])**2, axis=0) * .5**2
# Calculate error by bootstrap
params_ensemble = np.zeros((100, 2))
for i in range(100):
    idx = np.random.choice(np.arange(len(x)), 100, replace=True)
    x_sample = x[idx]
    y_sample = y[idx]
    # Using SVD to solve y = a + bx
    A = np.column_stack((np.ones_like(x_sample), x_sample))
    B = y_sample
    U, Winv, V, s = custom_svd(A, full_matrices=True)
    params_ensemble[i] = np.matmul(V.T@Winv@U.T, B)
errb = np.var(params_ensemble, axis=0)
# Calculate error from fitting an ensemble of independently generated datasets
params_ensemble = np.zeros((100, 2))
for i in range(100):
    # Generate a fresh dataset with the same noise level as the original (std = 0.5),
    # using new names so the original x, y are kept for the plot below
    x_new = np.random.uniform(low=0, high=1, size=100)
    y_new = 2 + 3*x_new + np.random.normal(loc=0.0, scale=.5, size=100)
    # Using SVD to solve y = a + bx
    A = np.column_stack((np.ones_like(x_new), x_new))
    B = y_new
    U, Winv, V, s = custom_svd(A, full_matrices=True)
    params_ensemble[i] = np.matmul(V.T@Winv@U.T, B)
errc = np.var(params_ensemble, axis=0)
plt.scatter(x,y, alpha=.5, label='data')
plt.plot(x, params[1]*x + params[0], color='red', label='SVD model')
plt.legend()
plt.title('Fitting with SVD')
plt.show()
print("Error Variances:")
print("(a) a: %f, b: %f" % tuple(erra) )
print("(b) a: %f, b: %f" % tuple(errb) )
print("(c) a: %f, b: %f" % tuple(errc) )
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import theano as th
import theano.tensor as T
# Generating 100 randomly distributed points and building the dataset
x_init = np.random.uniform(low=0, high=1, size=100)
y_init = np.sin(2 + 3*x_init) + np.random.normal(loc=0.0, scale=.1, size=100)
class LevenbergMarquardt():
    def __init__(self, a_init=1, b_init=1, l=1, adaptive=False):
        # Prepare Theano functions for the loss and its gradient
        # (the Hessian is approximated by hand in solve() below)
        a_s = T.scalar()
        b_s = T.scalar()
        x_s = T.vector()
        y_s = T.vector()
        yt = T.sin(a_s + b_s*x_s)
        loss = T.sum((y_s - yt)**2)
        grada = T.grad(cost=loss, wrt=a_s)
        gradb = T.grad(cost=loss, wrt=b_s)
        self.loss_fun = th.function(inputs=[x_s, y_s, a_s, b_s], outputs=[loss], allow_input_downcast=True)
        self.grad_fun = th.function(inputs=[x_s, y_s, a_s, b_s], outputs=[grada, gradb], allow_input_downcast=True)
        self.a = [a_init]
        self.b = [b_init]
        self.l = [l]
        self.error = []
        self.adaptive = adaptive
    def solve(self, x, y, steps=1000):
        self.error.append(self.loss_fun(x, y, self.a[-1], self.b[-1])[0])
        # Perform iterative minimization of error
        for step in range(steps):
            # Compute gradient
            G = np.asarray(self.grad_fun(x, y, self.a[-1], self.b[-1])).squeeze()
            # Compute Hessian (Gauss-Newton approximation: second-derivative terms are dropped)
            H = 2*np.array([[np.sum(np.cos(self.a[-1]+self.b[-1]*x)**2),
                             np.sum(x*(np.cos(self.a[-1]+self.b[-1]*x)**2))],
                            [np.sum(x*(np.cos(self.a[-1]+self.b[-1]*x)**2)),
                             np.sum(x*x*(np.cos(self.a[-1]+self.b[-1]*x)**2))]])
            # Compute the Levenberg-Marquardt M matrix (diagonal terms scaled by 1 + lambda)
            M = .5*H*np.array([[1+self.l[-1], 1], [1, 1+self.l[-1]]])
            # Update parameters
            D = -1 * np.matmul(np.linalg.inv(M), G.T)
            self.a.append(self.a[-1] + D[0])
            self.b.append(self.b[-1] + D[1])
            # Check new error and update lambda if adaptive
            new_error = self.loss_fun(x, y, self.a[-1], self.b[-1])[0]
            if self.adaptive:
                if new_error < self.error[-1]:
                    self.l.append(self.l[-1]*.75)
                else:
                    self.l.append(self.l[-1]*1.25)
            self.error.append(new_error)
        return self.a, self.b, self.l, self.error
def print_stats(x, y, a, b, l, error):
    print('Estimation: a=%f, b=%f' % (a[-1], b[-1]))
    print('Error: %f' % error[-1])
    print("")
    xs = np.arange(0, 1, .001)
    plt.plot(xs, np.sin(a[-1] + b[-1]*xs), color='red', label='LM Model')
    plt.scatter(x, y, label='data')
    plt.legend()
    plt.title('Estimated function')
    plt.show()
    plt.plot(np.arange(len(a)), a)
    plt.title("Parameter 'a'")
    plt.show()
    plt.plot(np.arange(len(b)), b)
    plt.title("Parameter 'b'")
    plt.show()
print('Non-adaptive, lambda = 1')
lm_solver1 = LevenbergMarquardt(a_init=1, b_init=1, l=1, adaptive=False)
a, b, l, error = lm_solver1.solve(x_init, y_init)
print_stats(x_init, y_init, a, b, l, error)
print('Adaptive, lambda = 1')
lm_solver2 = LevenbergMarquardt(a_init=1, b_init=1, l=1, adaptive=True)
a, b, l, error = lm_solver2.solve(x_init, y_init)
print_stats(x_init, y_init, a, b, l, error)
print('Non-adaptive, lambda = 100')
lm_solver3 = LevenbergMarquardt(a_init=1, b_init=1, l=100, adaptive=False)
a, b, l, error = lm_solver3.solve(x_init, y_init)
print_stats(x_init, y_init, a, b, l, error)
print('Adaptive, lambda = 100')
lm_solver4 = LevenbergMarquardt(a_init=1, b_init=1, l=100, adaptive=True)
a, b, l, error = lm_solver4.solve(x_init, y_init)
print_stats(x_init, y_init, a, b, l, error)
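As an added cross-check (not part of the original notebook, and assuming SciPy is available), the same model $y = \sin(a + bx)$ can be fitted with scipy.optimize.curve_fit, which for unbounded problems uses a Levenberg-Marquardt routine under the hood; its estimates should be close to the ones obtained above:
# Hypothetical sanity check (added): fit the same model with SciPy and compare
from scipy.optimize import curve_fit
def sin_model(x, a, b):
    return np.sin(a + b*x)
popt, pcov = curve_fit(sin_model, x_init, y_init, p0=[1, 1])
print('curve_fit: a=%f, b=%f' % tuple(popt))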
We want to maximize the entropy $S = - \int_{-\infty}^{+\infty}p(x)\log p(x)dx$ w.r.t. $p(x)$. This is a constrained maximization problem, since we want
a) $p(x)$ to be a probability distribution, and thus $\int_{-\infty}^{+\infty}p(x)dx = 1$
and b) $p(x)$ to give the measured expectation values for all the functions $f_i$, i.e. $\langle f_i(x) \rangle = \int_{-\infty}^{+\infty}p(x)f_i(x)dx $.
We are going to solve this constrained optimization problem using Lagrange multipliers.
The Lagrangian is, $$\mathcal{L} = - \int_{-\infty}^{+\infty}p(x)\log p(x)dx + \lambda_0\int_{-\infty}^{+\infty}p(x)dx + \sum_i\lambda_i\int_{-\infty}^{+\infty}p(x)f_i(x)dx$$.
To find the maximum, we want the derivative w.r.t. $p$ to be $0$. That is $$ \frac{d}{dp}\mathcal{L} = 0 \\ \frac{d}{dp}\Big[ - \int_{-\infty}^{+\infty}p(x)\log p(x)dx + \lambda_0\int_{-\infty}^{+\infty}p(x)dx + \sum_i\lambda_i\int_{-\infty}^{+\infty}p(x)f_i(x)dx \Big] = 0\\ $$
Since the integrands contain no derivatives of $p$, the functional derivative reduces to differentiating the integrand pointwise with respect to $p$: $$ \frac{\partial}{\partial p}\Big[-p\log p + \lambda_0 p + \sum_i\lambda_i p f_i(x)\Big] = 0 \\ -1 -\log p + \lambda_0 + \sum_i\lambda_i f_i(x) = 0\\ $$
From where we get the solution, $$p(x) = e^{- 1 + \lambda_0 + \sum_i\lambda_i f_i(x)}$$
By substituting $p(x)$ back into the constraints, we can find the multipliers.
We can use the above solution by setting $f_1(x) = x^2$, which (for a zero-mean distribution) constrains the variance: $\langle x^2 \rangle = \sigma^2$.
Then, the maxent distribution is $p(x) = e^{- 1 + \lambda_0 + \lambda_1 x^2}$.
Now let's find the multipliers. At first, we want (note that $\lambda_1$ must be negative for the integrals to converge) $$\begin{align} \int_{-\infty}^{+\infty}p(x)dx &= 1 \\ \int_{-\infty}^{+\infty}e^{- 1 + \lambda_0 + \lambda_1 x^2}dx &= 1 \\ e^{- 1 + \lambda_0}\int_{-\infty}^{+\infty}e^{\lambda_1 x^2}dx &= 1 \end{align}$$
Solving the infinite integral (analytically, with Wolfram Alpha) gives $\int_{-\infty}^{+\infty}e^{\lambda_1 x^2}dx = \sqrt{\pi/(-\lambda_1)}$, and hence the first equation for the multipliers (we need two): $-\lambda_1 = \pi e^{-2(1-\lambda_0)}$.
From the second constraint we get, $$\begin{align} \int_{-\infty}^{+\infty}p(x)x^2dx &= \sigma^2 \\ \int_{-\infty}^{+\infty}e^{- 1 + \lambda_0 + \lambda_1 x^2}x^2dx &= \sigma^2 \\ e^{- 1 + \lambda_0}\int_{-\infty}^{+\infty}e^{\lambda_1 x^2}x^2dx &= \sigma^2 \end{align}$$
Again, using (the amazing) Wolfram Alpha to solve the integral, $\int_{-\infty}^{+\infty}x^2 e^{\lambda_1 x^2}dx = \frac{1}{2}\sqrt{\pi/(-\lambda_1)^3}$, we get the second equation, which is $(-\lambda_1)^3 = \frac{\pi e^{-2(1-\lambda_0)}}{4 \sigma^4}$.
Substituting the first equation into the second, we get $(-\lambda_1)^2 = \frac{1}{4 \sigma^4} \Rightarrow \lambda_1 = -\frac{1}{2\sigma^2}$.
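The two integrals used above can also be double-checked symbolically (an added verification, assuming sympy is installed; the derivation itself relied on Wolfram Alpha):
# Added symbolic check of the two Gaussian integrals (not part of the original derivation)
import sympy as sp
xs = sp.symbols('x', real=True)
lam = sp.symbols('lambda_1', negative=True)  # lambda_1 < 0 for convergence
I0 = sp.integrate(sp.exp(lam*xs**2), (xs, -sp.oo, sp.oo))        # expect sqrt(pi/(-lambda_1))
I2 = sp.integrate(xs**2*sp.exp(lam*xs**2), (xs, -sp.oo, sp.oo))  # expect sqrt(pi)/(2*(-lambda_1)**(3/2))
print(sp.simplify(I0))
print(sp.simplify(I2))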
Consequently, solving for $\lambda_0$, we get $\lambda_0 = -\log\sqrt{2 \pi \sigma^2} +1$.
Finally, plugging the multipliers back in, we now have the maximum entropy distribution for a given variance, i.e. $$p(x) = e^{- 1 + 1 -\log\sqrt{2 \pi \sigma^2} - \frac{x^2}{2\sigma^2}} \\ p(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{x^2}{2\sigma^2}}$$
Wow, after all this math, it turns out that the maxent distribution for a given variance $\sigma^2$ is a Gaussian with zero mean and variance $\sigma^2$.
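As a quick numerical check of this result (an addition to the derivation), we can confirm that the derived density integrates to one and reproduces the target variance:
# Added numerical check: normalization and second moment of the maxent Gaussian
import numpy as np
sigma = 1.3                                   # arbitrary test value
xs = np.linspace(-10*sigma, 10*sigma, 100001)
p = np.exp(-xs**2/(2*sigma**2)) / np.sqrt(2*np.pi*sigma**2)
print(np.trapz(p, xs))                        # should be ~1
print(np.trapz(xs**2*p, xs), sigma**2)        # should match sigma^2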
If $f$ is the desired estimator, then we have to show that the expected value of $f$ equals $\mu$ (no bias) and that $f$ has the smallest possible error, i.e. $\sigma^2(f) = \frac{1}{J_N(\mu)}$, known as the Cramér-Rao bound.
For the second part, we first calculate the quantity $\frac{1}{J_N(\mu)}$.
By definition $p_\mu(x) = \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}$.
Thus, $V = \frac{\partial}{\partial \mu}\log{p_\mu(x)}= \dots = \frac{1}{\sigma^2}(x-\mu)$ and, in turn, $J(\mu)=\langle V^2 \rangle = \frac{1}{\sigma^4}\langle (x-\mu)^2 \rangle = \frac{\sigma^2}{\sigma^4} = \frac{1}{\sigma^2}$. Finally, $J_N(\mu) = N J(\mu) = \frac{N}{\sigma^2}$.
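This value can also be checked empirically (an added sketch, not part of the argument): the score $V = (x-\mu)/\sigma^2$ evaluated on Gaussian samples should have variance close to $1/\sigma^2$.
# Added Monte Carlo check of the Fisher information J(mu) = 1/sigma^2
import numpy as np
mu, sigma = 2.0, 1.5                          # arbitrary test values
samples = np.random.normal(mu, sigma, size=1000000)
score = (samples - mu) / sigma**2
print(np.var(score), 1/sigma**2)              # the two numbers should be close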
Guided by intuition, let's select the sample mean as the estimator, i.e. $f(\mathbf{X}) = \frac{1}{N}\sum_{n=1}^{N}x_n$. Now let's prove it is optimal.
Firstly, $\langle f\rangle = \Big\langle \frac{1}{N}\sum_{n=1}^{N}x_n \Big\rangle = \frac{1}{N}\sum_{n=1}^{N}\langle x_n\rangle = \frac{N\mu}{N} = \mu.$
Furthermore, since the samples are independent the cross terms vanish, so $$\begin{align} \sigma^2(f) &=\Big\langle \Big[\Big(\frac{1}{N}\sum_{n=1}^{N}x_n\Big)-\mu\Big]^2\Big\rangle \\ &= \frac{1}{N^2}\Big\langle \Big[\sum_{n=1}^{N}(x_n-\mu)\Big]^2\Big\rangle \\ &= \frac{1}{N^2}\sum_{n=1}^{N}\big\langle (x_n-\mu)^2\big\rangle \\ &= \frac{N\sigma^2}{N^2} = \frac{\sigma^2}{N} \\ &= \frac{1}{J_N(\mu)} \end{align}$$
Hence, $f$ is unbiased and saturates the Cramér-Rao bound, so it is an optimal estimator.
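As a final empirical check (an addition, not part of the original proof), drawing many datasets of $N$ Gaussian samples and taking the sample mean of each should give an estimator that is unbiased with variance close to $\sigma^2/N = 1/J_N(\mu)$:
# Added Monte Carlo check that the sample mean attains the Cramer-Rao bound
import numpy as np
mu, sigma, N = 2.0, 1.5, 100                  # arbitrary test values
means = np.random.normal(mu, sigma, size=(10000, N)).mean(axis=1)
print(np.mean(means), mu)                     # unbiased: close to mu
print(np.var(means), sigma**2/N)              # efficient: close to sigma^2/N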