# Chapter 14 - Function Architectures¶

### (14.1) Find the first five diagonal Padé approximants $[1/1],\dots,[5/5]$ to $e^x$ around the origin. Remember that the numerator and denominator can be multiplied by a constant to make the numbers as convenient as possible. Evaluate the approximations at $x=1$ and compare with the correct value of $e=2.718281828459045$. How is the error improving with the order?¶

Before we proceed to compute the approximants in Python, let's rewrite the equations (14.7) into its matrix form.

For an $[N/M]$ order polynomial, we have for $l = N+1,.\dots,N+M$ \begin{align} 0 &= c_l + \sum_{m=1}^{N}b_m c_{l-m} \\ \begin{bmatrix}0 \\ 0 \\ \vdots \\ 0 \end{bmatrix} &= \begin{bmatrix}c_{N+1} \\ c_{N+2} \\ \vdots \\ c_{N+M} \end{bmatrix} + \begin{bmatrix} c_N & c_{N-1} & \dots \\ \vdots & \ddots & \\ c_{N+1-M} & & c_{N} \end{bmatrix} \begin{bmatrix}b_1 \\ b_2 \\ \vdots \\ b_M \end{bmatrix} \\ \begin{bmatrix}b_1 \\ b_2 \\ \vdots \\ b_M \end{bmatrix} &= \begin{bmatrix} c_N & c_{N-1} & \dots \\ \vdots & \ddots & \\ c_{N+1-M} & & c_{N} \end{bmatrix}^{-1} \begin{bmatrix}c_{N+1} \\ c_{N+2} \\ \vdots \\ c_{N+M} \end{bmatrix} \end{align}

Also, the Taylor expansion for $e^x$ is $1 + \sum_{m=1}^{\infty}\frac{x^m}{m!}$.

In [1]:
# Finding Pade approximants
import numpy as np

# Calculate Taylor expansion for exp up to N+M order
c = 1. / np.insert(np.cumprod(range(1, M+N+1), dtype=np.float32),0,1)
# Calculate b coefficients by doing the matrix multiplication described above
C = np.zeros((M, M))
for i, l in enumerate(range(N, N+M)):
C[i] = c[l:l-M:-1]
#     print(C)
#     print(c[N+1:N+M+1])
b = - np.linalg.inv(C) @ c[N+1:N+M+1]
b = np.insert(b, 0, 1) # b_0 = 1
#     print(b)

# Calculate a coefficients
a = np.zeros_like(b)
a[0] = c[0]
for n in range(1, N+1):
a[n] = c[n] + b[1:n+1]@np.flip(c[:n], 0)
print(a)

if x:
x = [x**i for i in range(0, N+M+1)]
y = a@x[:N+1] / (b@x[:M+1])
err = np.abs(np.exp(x[1]) - y)

# Scaling coefficients to have the constant term equal to 1
scale = np.min(np.abs(a))
return a/scale, b/scale, y, err
else:
return a, b

In [2]:
np.set_printoptions(precision=2)
for i in range(1,6):
a, b, y, err = exp_pade_approximant(i, i, x=1.)
print('For the [{0}/{0}] approximant'.format(i))
print('-----------------------------')
print('Error: %.2g' % err)
print('a', a)
print('b', b)

[ 1.   0.5]
For the [1/1] approximant
-----------------------------
Error: 0.28
a [ 2.  1.]
b [ 2. -1.]
[ 1.    0.5   0.08]
For the [2/2] approximant
-----------------------------
Error: 0.004
a [ 12.   6.   1.]
b [ 12.  -6.   1.]
[ 1.    0.5   0.1   0.01]
For the [3/3] approximant
-----------------------------
Error: 2.8e-05
a [ 120.   60.   12.    1.]
b [ 120.  -60.   12.   -1.]
[  1.00e+00   5.00e-01   1.07e-01   1.19e-02   5.95e-04]
For the [4/4] approximant
-----------------------------
Error: 1e-07
a [  1.68e+03   8.40e+02   1.80e+02   2.00e+01   1.00e+00]
b [  1.68e+03  -8.40e+02   1.80e+02  -2.00e+01   1.00e+00]
[  1.00e+00   5.00e-01   1.11e-01   1.39e-02   9.92e-04   3.31e-05]
For the [5/5] approximant
-----------------------------
Error: 7e-09
a [  3.02e+04   1.51e+04   3.36e+03   4.20e+02   3.00e+01   1.00e+00]
b [  3.02e+04  -1.51e+04   3.36e+03  -4.20e+02   3.00e+01  -9.99e-01]


### (14.2) Take as a dataset $x = {-10, -9, \dots, 9, 10}$, and $y(x)=0$ if $x\leq 0$ and $y(x)=1$ if $x>0$.¶

In [3]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')

x_train = np.arange(-10,10,1)
y_train = (x_train>0).astype(np.float32)
x_valid = np.arange(-10.5,10.5,1)
y_valid = (x_valid>0).astype(np.float32)
x_vis = np.arange(-10.5, 10.5, .1)


#### (a) Fit the data with a polynomial with 5, 10 and 15 terms, using a pseudo-inverse of the Vandermonde matrix.¶

In [4]:
def fit_polynomial(x_train, y_train, x_valid, y_valid, x_vis, order):
# Create Vandermonde matrix
A = np.zeros((order, len(x_train)))
for i in range(order):
A[i] = x_train**i

c = np.linalg.pinv(A.T)@y_train

A = np.zeros((order, len(x_valid)))
for i in range(order):
A[i] = x_valid**i

error = np.mean(np.square(y_valid - A.T@c))

A = np.zeros((order, len(x_vis)))
for i in range(order):
A[i] = x_vis**i

y_vis = A.T@c

return error, y_vis

In [5]:
plt.figure()
plt.scatter(x_train, y_train, color='k', marker='x')
plt.xlim([-10.5, 10.5])
plt.ylim([-.5, 1.5])
for order in [5, 10, 15]:
error, y_vis = fit_polynomial(x_train, y_train, x_valid, y_valid, x_vis, order)
plt.plot(x_vis, y_vis, label='%dth Order, Error: %.3f' % (order, error))
plt.legend()
plt.show()


#### (b) Fit the data with 5, 10, and 15 $r^3$ RBFs uniformly distributed between $x=-10$ and $x=-10$.¶

In [6]:
def fit_rbf(x_train, y_train, x_valid, y_valid, x_vis, order):

# Position RBFs
c = np.arange(-10, 10, 20//order)

# Create RBF matrix
R = np.zeros((order, len(x_train)))
for i in range(order):
R[i] = np.abs(x_train - c[i])**3

a = np.linalg.pinv(R.T)@y_train

R = np.zeros((order, len(x_valid)))
for i in range(order):
R[i] = np.abs(x_valid - c[i])**3

error = np.mean(np.square(y_valid - R.T@a))

R = np.zeros((order, len(x_vis)))
for i in range(order):
R[i] = np.abs(x_vis - c[i])**3

y_vis = R.T@a

return error, y_vis

In [7]:
plt.figure()
plt.scatter(x_train, y_train, color='k', marker='x')
plt.xlim([-10.5, 10.5])
plt.ylim([-.5, 1.5])
for order in [5, 10, 15]:
error, y_vis = fit_rbf(x_train, y_train, x_valid, y_valid, x_vis, order)
plt.plot(x_vis, y_vis, label='%dth Order, Error: %.3f' % (order, error))
plt.legend()
plt.show()


#### (c) Using the coefficients found for these six fits, evaluate the total out-of-sample error at $x={-10.5, -9.5, \dots, 9.5, 10.5}$.¶

In [8]:
plt.figure()
polynomial_err = [fit_polynomial(x_train, y_train, x_valid, y_valid, x_vis, order)[0] for order in [5, 10, 15]]
rbf_err = [fit_rbf(x_train, y_train, x_valid, y_valid, x_vis, order)[0] for order in [5, 10, 15]]
plt.plot([5, 10, 15], polynomial_err, label='Polynomial Fit', marker='o')
plt.plot([5, 10, 15], rbf_err, label='RBF Fit', marker='*', markersize='10')
plt.xlabel('Order')
plt.ylabel('Error')
plt.xticks([5, 10, 15])
plt.title('Error vs Order')
plt.legend()
plt.show()


#### (d) Using a 10th-order polynomial, fit the data with the curvature regularizer in equation (14.47), and plot the fits for $\lambda = 0,0.01,0.1,1$.¶

Adding the curvature regularizer, we want to minimize $$I = \sum_{n=1}^{N}[y_n-f(x_n,\vec{a})]^2 + \lambda \int \Big[\frac{d^2f}{dx^2}\Big]^2dx$$

We have $$f(x,\vec{a}) = \sum_{m=0}^{10}a_mx^m$$ and $$\frac{d^2f}{dx^2} = \sum_{m=0}^{10}a_m m (m-1)x^{m-2}$$

Thus, $$I = \sum_{n=1}^{N}[y_n-\sum_{m=0}^{10}a_mx_n^m]^2 + \lambda \int_{x_-}^{x_+} \Big[\sum_{m=2}^{10}a_m m (m-1)x^{m-2}]^2dx$$

To minimize, we want the derivative with respect to the coefficient to be zero, $$\frac{\partial I}{\partial a_i}=0 \\ \frac{\partial}{\partial a_i}\sum_{n=1}^{N}[y_n-\sum_{m=0}^{10}a_ix_n^m]^2 + \frac{\partial}{\partial a_i}\lambda \int_{x_-}^{x_+} \Big[\sum_{m=2}^{10}a_i m (m-1)x^{m-2}]^2dx = 0 \\ -x_n^{i}\Big(\sum_{n=1}^{N}[y_n-\sum_{m=0}^{10}a_ix_n^m]\Big) -\lambda i(i-1)x^{i-2} \int_{x_-}^{x_+} \Big[\sum_{m=2}^{10}a_i m (m-1)x^{m-2}]^2dx = 0 \\ -\sum_{n=1}^{N}x_n^{i}y_n + \sum_{n=1}^{N}\sum_{m=0}^{10}a_ix_n^{m+i} - \lambda i(i-1)\sum_{m=2}^{10}a_i m (m-1)\int_{x_-}^{x_+}x^{m+i-4} = 0 \\ -\sum_{n=1}^{N}x_n^{i}y_n + \sum_{n=1}^{N}\sum_{m=0}^{10}a_ix_n^{m+i} - \lambda i(i-1)\sum_{m=2}^{10}a_i m (m-1)\frac{x_+^{m+i-3}-x_-^{m+i-3}}{m+i-3} = 0$$

Wow, that required extreme bookkeeping to do the partial with the integration and all the sums!!

Now we care about the coefficients, so we can solve for them $$a_i = -\sum_{n=1}^{N}x_n^{i}y_n + \sum_{n=1}^{N}\sum_{m=0}^{10}x_n^{m+i} - \lambda i(i-1)\sum_{m=2}^{10}m (m-1)\frac{x_+^{m+i-3}-x_-^{m+i-3}}{m+i-3}$$

Python implementation follows

In [9]:
def fit_poly_regularizer(x_train, y_train, x_valid, y_valid, x_vis, order, lam):

# Create Vandermonde matrix
A = np.zeros((order,))
for i in range(order):
A[i] = y_train @ (x_train/order)**i

B = np.zeros((order, order))
for i in range(order):
for j in range(order):
B[i,j] = np.sum((x_train/order)**(i+j))

C = np.zeros((order, order))
for i in range(2,order):
for j in range(2, order):
C[i,j] = lam * i * (i-1) * j * (j-1) * (1**(j+i-3) - (-1)**(j+i-3))/(j+i-3)

c = np.linalg.inv(B-C)@A

A = np.zeros((order, len(x_valid)))
for i in range(order):
A[i] = (x_valid/order)**i

error = np.mean(np.square(y_valid - A.T@c))

A = np.zeros((order, len(x_vis)))
for i in range(order):
A[i] = (x_vis/order)**i

y_vis = A.T@c

return error, y_vis

In [10]:
plt.figure()
plt.scatter(x_train, y_train, color='k', marker='x')
plt.xlim([-10.5, 10.5])
plt.ylim([-.5, 1.5])
for lam in [0, 0.01, 0.1, 1]:
error, y_vis = fit_poly_regularizer(x_train, y_train, x_valid, y_valid, x_vis, 10, lam)
plt.plot(x_vis, y_vis, label='Lambda %.2f, Error: %.3f' % (lam, error))
plt.legend()
plt.show()


### (14.3) Train a neural network on the output from an order 4 maximal LFSR (Problem 6.3) and learn to reproduce it. How do the results depend on the network depth and architecture?¶

In [11]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')
import keras
import keras.backend as K
from keras.models import Model
from keras.layers import Dense, Activation, Input
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

Using TensorFlow backend.

In [12]:
# Generate LFSR code
x_prev = [0, 0, 0, 1]
x_data = []
y_data = []
while (len(x_data)<10000):
x = (x_prev[0] + x_prev[3]) % 2
y_data.append(x)
x_data.append(x_prev)
x_prev.pop()
x_prev.insert(0,x)

In [13]:
# Prepare datasets
# Dataset for the feed forward
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size=.2)
# Dataset for the recurrent
x_seq = y_data
# Dataset for the autoencoder
x_train_auto, x_valid_auto = train_test_split(x_data, test_size=.2)

In [14]:
def train_feed_forward(x_train, y_train, x_test, y_test, n_neurons=None, activation='sigmoid'):

inp = Input(batch_shape=(None, 4))
x = inp
for i in n_neurons:
x = Dense(i)(x)
x = Activation(activation)(x)

out = Dense(1,use_bias=False, activation=activation)(x)

model = Model(inp, out)

model.compile(loss='mse',
optimizer='sgd')

history = model.fit(x_train, y_train,
batch_size=100,
epochs=1000,
verbose=0,
validation_split=0,
callbacks=[EarlyStopping(patience=20, min_delta=.001, monitor='loss')])

y_pred = model.predict(x_test, batch_size=50, verbose=1)
error = model.evaluate(x_test, y_test, batch_size=50, verbose=1)

return y_pred, model.count_params(), error

In [15]:
layers = [1, 2, 3, 5]
neurons = [2, 5, 10, 20]
error = np.zeros((len(layers), len(neurons)))
params = np.zeros_like(error)
for i,l in enumerate(layers):
for j,n in enumerate(neurons):
y_pred, params[i,j], error[i,j] = train_feed_forward(x_train, y_train, x_valid, y_valid, n_neurons=[n for _ in range(l)], activation='sigmoid')

  50/2000 [..............................] - ETA: 4s
In [16]:
print('Plot of error (heatmap) and number of params (annotation) for layers vs neurons')
ax = sns.heatmap(error, xticklabels=layers, yticklabels=neurons, annot=params)
ax.set_xlabel('Number of Layers')
ax.set_ylabel('Number of Neurons per layer')
plt.show()

Plot of error (heatmap) and number of params (annotation) for layers vs neurons