Chapter 14 - Function Architectures

(14.1) Find the first five diagonal Padé approximants $[1/1],\dots,[5/5]$ to $e^x$ around the origin. Remember that the numerator and denominator can be multiplied by a constant to make the numbers as convenient as possible. Evaluate the approximations at $x=1$ and compare with the correct value of $e=2.718281828459045$. How is the error improving with the order?

Before computing the approximants in Python, let's rewrite equation (14.7) in matrix form.

For an $[N/M]$ approximant, we have for $l = N+1,\dots,N+M$ (with $c_k = 0$ for $k<0$) $$\begin{align} 0 &= c_l + \sum_{m=1}^{M}b_m c_{l-m} \\ \begin{bmatrix}0 \\ 0 \\ \vdots \\ 0 \end{bmatrix} &= \begin{bmatrix}c_{N+1} \\ c_{N+2} \\ \vdots \\ c_{N+M} \end{bmatrix} + \begin{bmatrix} c_N & c_{N-1} & \dots & c_{N+1-M} \\ c_{N+1} & c_N & & \vdots \\ \vdots & & \ddots & \\ c_{N+M-1} & \dots & & c_{N} \end{bmatrix} \begin{bmatrix}b_1 \\ b_2 \\ \vdots \\ b_M \end{bmatrix} \\ \begin{bmatrix}b_1 \\ b_2 \\ \vdots \\ b_M \end{bmatrix} &= -\begin{bmatrix} c_N & c_{N-1} & \dots & c_{N+1-M} \\ c_{N+1} & c_N & & \vdots \\ \vdots & & \ddots & \\ c_{N+M-1} & \dots & & c_{N} \end{bmatrix}^{-1} \begin{bmatrix}c_{N+1} \\ c_{N+2} \\ \vdots \\ c_{N+M} \end{bmatrix} \end{align}$$

Also, the Taylor expansion of $e^x$ is $1 + \sum_{m=1}^{\infty}\frac{x^m}{m!}$, so the coefficients are $c_m = 1/m!$.
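As a quick worked instance of these formulas (my own check, which matches the $[1/1]$ output below): with $c_0=c_1=1$ and $c_2=\tfrac12$, $$b_1 = -\frac{c_2}{c_1} = -\frac12,\qquad a_0 = c_0 = 1,\qquad a_1 = c_1 + b_1 c_0 = \frac12,$$ so $$[1/1]:\quad e^x \approx \frac{1 + x/2}{1 - x/2} = \frac{2+x}{2-x},$$ which at $x=1$ gives $3$, an error of about $0.28$.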

In [1]:
# Finding Padé approximants to exp(x)
import numpy as np

def exp_pade_approximant(M, N, x=None):
    # Taylor coefficients of exp up to order N+M: c[m] = 1/m!
    c = 1. / np.insert(np.cumprod(range(1, M+N+1), dtype=np.float32), 0, 1)

    # Solve for the denominator coefficients b by inverting the matrix described above
    C = np.zeros((M, M))
    for i, l in enumerate(range(N, N+M)):
        C[i] = c[l:l-M:-1]
    b = - np.linalg.inv(C) @ c[N+1:N+M+1]
    b = np.insert(b, 0, 1)  # b_0 = 1

    # Numerator coefficients: a_n = c_n + sum_{m=1}^{n} b_m c_{n-m}
    a = np.zeros_like(b)
    a[0] = c[0]
    for n in range(1, N+1):
        a[n] = c[n] + b[1:n+1] @ np.flip(c[:n], 0)
    print(a)  # unscaled numerator coefficients

    if x is not None:
        powers = [x**i for i in range(0, N+M+1)]
        y = a @ powers[:N+1] / (b @ powers[:M+1])
        err = np.abs(np.exp(x) - y)

        # Rescale so the smallest-magnitude (highest-order) coefficient becomes 1,
        # which makes the numbers as convenient as possible
        scale = np.min(np.abs(a))
        return a/scale, b/scale, y, err
    else:
        return a, b
In [2]:
np.set_printoptions(precision=2)
for i in range(1,6):
    a, b, y, err = exp_pade_approximant(i, i, x=1.)
    print('For the [{0}/{0}] approximant'.format(i))
    print('-----------------------------')
    print('Error: %.2g' % err)
    print('a', a)
    print('b', b)
[ 1.   0.5]
For the [1/1] approximant
-----------------------------
Error: 0.28
a [ 2.  1.]
b [ 2. -1.]
[ 1.    0.5   0.08]
For the [2/2] approximant
-----------------------------
Error: 0.004
a [ 12.   6.   1.]
b [ 12.  -6.   1.]
[ 1.    0.5   0.1   0.01]
For the [3/3] approximant
-----------------------------
Error: 2.8e-05
a [ 120.   60.   12.    1.]
b [ 120.  -60.   12.   -1.]
[  1.00e+00   5.00e-01   1.07e-01   1.19e-02   5.95e-04]
For the [4/4] approximant
-----------------------------
Error: 1e-07
a [  1.68e+03   8.40e+02   1.80e+02   2.00e+01   1.00e+00]
b [  1.68e+03  -8.40e+02   1.80e+02  -2.00e+01   1.00e+00]
[  1.00e+00   5.00e-01   1.11e-01   1.39e-02   9.92e-04   3.31e-05]
For the [5/5] approximant
-----------------------------
Error: 7e-09
a [  3.02e+04   1.51e+04   3.36e+03   4.20e+02   3.00e+01   1.00e+00]
b [  3.02e+04  -1.51e+04   3.36e+03  -4.20e+02   3.00e+01  -9.99e-01]
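The error improves by roughly two orders of magnitude with each increase in order, from $0.28$ for $[1/1]$ down to about $7\times10^{-9}$ for $[5/5]$; the last step gains less, presumably because float32 round-off in the Taylor coefficients starts to matter (note the $-9.99\times10^{-1}$ in the last denominator). As an independent cross-check (my addition, assuming SciPy is available), scipy.interpolate.pade builds the same approximant directly from the Taylor coefficients:

# Cross-check the [5/5] approximant with SciPy's built-in Padé routine
from scipy.interpolate import pade

taylor = 1. / np.insert(np.cumprod(range(1, 11), dtype=np.float64), 0, 1)  # 1/m! for m = 0..10
p, q = pade(taylor, 5)                    # numerator p and denominator q as np.poly1d
print(abs(p(1.) / q(1.) - np.e))          # error of the [5/5] approximant at x=1, in float64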

(14.2) Take as a dataset $x = \{-10, -9, \dots, 9, 10\}$, and $y(x)=0$ if $x\leq 0$ and $y(x)=1$ if $x>0$.

In [3]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')

x_train = np.arange(-10, 11, 1)         # x = {-10, ..., 10}
y_train = (x_train > 0).astype(np.float32)
x_valid = np.arange(-10.5, 11.5, 1)     # out-of-sample points {-10.5, ..., 10.5}
y_valid = (x_valid > 0).astype(np.float32)
x_vis = np.linspace(-10.5, 10.5, 211)   # dense grid for plotting

(a) Fit the data with a polynomial with 5, 10 and 15 terms, using a pseudo-inverse of the Vandermonde matrix.

In [4]:
def fit_polynomial(x_train, y_train, x_valid, y_valid, x_vis, order):
    # Matrix of powers of x_train (the transpose of the Vandermonde matrix);
    # "order" is the number of terms, so the polynomial degree is order-1
    A = np.zeros((order, len(x_train)))
    for i in range(order):
        A[i] = x_train**i

    # Least-squares coefficients via the pseudo-inverse
    c = np.linalg.pinv(A.T)@y_train
    
    A = np.zeros((order, len(x_valid)))
    for i in range(order):
        A[i] = x_valid**i
    
    error = np.mean(np.square(y_valid - A.T@c))
    
    
    A = np.zeros((order, len(x_vis)))
    for i in range(order):
        A[i] = x_vis**i
    
    y_vis = A.T@c
    
    return error, y_vis
In [5]:
plt.figure()
plt.scatter(x_train, y_train, color='k', marker='x')
plt.xlim([-10.5, 10.5])
plt.ylim([-.5, 1.5])
for order in [5, 10, 15]:
    error, y_vis = fit_polynomial(x_train, y_train, x_valid, y_valid, x_vis, order)
    plt.plot(x_vis, y_vis, label='%dth Order, Error: %.3f' % (order, error))
plt.legend()
plt.show()
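The same least-squares fit can also be written more compactly with np.vander; a sketch I'm adding as a cross-check of fit_polynomial, not part of the original solution:

# Compact cross-check of the polynomial fit using np.vander
def fit_polynomial_vander(x_train, y_train, x_eval, order):
    V = np.vander(x_train, order, increasing=True)  # columns x^0, x^1, ..., x^(order-1)
    coeffs = np.linalg.pinv(V) @ y_train            # least-squares coefficients
    return np.vander(x_eval, order, increasing=True) @ coeffs

y_check = fit_polynomial_vander(x_train, y_train, x_vis, 5)  # should match the 5-term fit above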

(b) Fit the data with 5, 10, and 15 $r^3$ RBFs uniformly distributed between $x=-10$ and $x=10$.

In [6]:
def fit_rbf(x_train, y_train, x_valid, y_valid, x_vis, order):

    # Place the RBF centers uniformly between x=-10 and x=10
    # (np.arange(-10, 10, 20//order) does not span the full interval)
    c = np.linspace(-10, 10, order)

    # Matrix of r^3 basis functions evaluated at the training points
    R = np.zeros((order, len(x_train)))
    for i in range(order):
        R[i] = np.abs(x_train - c[i])**3

    # Least-squares weights via the pseudo-inverse
    a = np.linalg.pinv(R.T)@y_train
    
    R = np.zeros((order, len(x_valid)))
    for i in range(order):
        R[i] = np.abs(x_valid - c[i])**3
    
    error = np.mean(np.square(y_valid - R.T@a))
    
    
    R = np.zeros((order, len(x_vis)))
    for i in range(order):
        R[i] = np.abs(x_vis - c[i])**3
    
    y_vis = R.T@a
    
    return error, y_vis
In [7]:
plt.figure()
plt.scatter(x_train, y_train, color='k', marker='x')
plt.xlim([-10.5, 10.5])
plt.ylim([-.5, 1.5])
for order in [5, 10, 15]:
    error, y_vis = fit_rbf(x_train, y_train, x_valid, y_valid, x_vis, order)
    plt.plot(x_vis, y_vis, label='%dth Order, Error: %.3f' % (order, error))
plt.legend()
plt.show()
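The matrix-building loops can also be collapsed with broadcasting; a small sketch (my rewrite of the same math, with a hypothetical helper rbf_design):

# Vectorized construction of the r^3 RBF design matrix
def rbf_design(x, centers):
    # |x_n - c_i|^3 for every point/center pair, shape (len(x), len(centers))
    return np.abs(x[:, None] - centers[None, :])**3

centers = np.linspace(-10, 10, 10)
weights = np.linalg.pinv(rbf_design(x_train, centers)) @ y_train
y_check = rbf_design(x_vis, centers) @ weights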

(c) Using the coefficients found for these six fits, evaluate the total out-of-sample error at $x=\{-10.5, -9.5, \dots, 9.5, 10.5\}$.

In [8]:
plt.figure()
polynomial_err = [fit_polynomial(x_train, y_train, x_valid, y_valid, x_vis, order)[0] for order in [5, 10, 15]]
rbf_err = [fit_rbf(x_train, y_train, x_valid, y_valid, x_vis, order)[0] for order in [5, 10, 15]]
plt.plot([5, 10, 15], polynomial_err, label='Polynomial Fit', marker='o')
plt.plot([5, 10, 15], rbf_err, label='RBF Fit', marker='*', markersize='10')
plt.xlabel('Order')
plt.ylabel('Error')
plt.xticks([5, 10, 15])
plt.title('Error vs Order')
plt.legend()
plt.show()

(d) Using a 10th-order polynomial, fit the data with the curvature regularizer in equation (14.47), and plot the fits for $\lambda = 0,0.01,0.1,1$.

Adding the curvature regularizer, we want to minimize $$I = \sum_{n=1}^{N}[y_n-f(x_n,\vec{a})]^2 + \lambda \int \Big[\frac{d^2f}{dx^2}\Big]^2dx$$

We have $$f(x,\vec{a}) = \sum_{m=0}^{10}a_mx^m$$ and $$\frac{d^2f}{dx^2} = \sum_{m=0}^{10}a_m m (m-1)x^{m-2}$$

Thus, $$I = \sum_{n=1}^{N}\Big[y_n-\sum_{m=0}^{10}a_mx_n^m\Big]^2 + \lambda \int_{x_-}^{x_+} \Big[\sum_{m=2}^{10}a_m m (m-1)x^{m-2}\Big]^2dx$$

To minimize, we set the derivative with respect to each coefficient $a_i$ to zero. Using $\partial f/\partial a_i = x^i$ and $\partial (d^2f/dx^2)/\partial a_i = i(i-1)x^{i-2}$, $$\frac{\partial I}{\partial a_i} = -2\sum_{n=1}^{N}x_n^{i}\Big[y_n-\sum_{m=0}^{10}a_m x_n^m\Big] + 2\lambda\, i(i-1)\int_{x_-}^{x_+} x^{i-2}\sum_{m=2}^{10}a_m\, m (m-1)\,x^{m-2}\,dx = 0$$

Carrying out the integral and rearranging gives, for each $i$, $$\sum_{n=1}^{N}x_n^{i}y_n = \sum_{m=0}^{10}a_m\sum_{n=1}^{N}x_n^{m+i} + \lambda\, i(i-1)\sum_{m=2}^{10}a_m\, m(m-1)\,\frac{x_+^{m+i-3}-x_-^{m+i-3}}{m+i-3}$$

That took some careful bookkeeping with the partial derivative, the integral, and all the sums!

This is a linear system for the coefficients, $A_i = \sum_{m}(B_{im}+C_{im})\,a_m$, with $$A_i = \sum_{n=1}^{N}x_n^{i}y_n,\qquad B_{im} = \sum_{n=1}^{N}x_n^{m+i},\qquad C_{im} = \lambda\, i(i-1)\,m(m-1)\,\frac{x_+^{m+i-3}-x_-^{m+i-3}}{m+i-3},$$ so $\vec{a} = (B+C)^{-1}\vec{A}$.
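To double-check the integral term without the hand bookkeeping, here is a small symbolic sketch (my addition, assuming SymPy is available), for one example pair of indices:

# Symbolic check of one cross term in the curvature integral
import sympy as sp

x, x_minus, x_plus = sp.symbols('x x_minus x_plus')
i, m = 4, 6   # any example indices with i, m >= 2
term = sp.integrate(i*(i-1)*x**(i-2) * m*(m-1)*x**(m-2), (x, x_minus, x_plus))
print(term)   # 360*(x_plus**7 - x_minus**7)/7, i.e. i(i-1)m(m-1)(x_+^(i+m-3) - x_-^(i+m-3))/(i+m-3)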

The Python implementation follows; note that $x$ is rescaled by the order (10) so the data lie in $[-1,1]$ and $x_\pm=\pm 1$ in the integral.

In [9]:
def fit_poly_regularizer(x_train, y_train, x_valid, y_valid, x_vis, order, lam):

    # Rescale x by the order (10) so the data lie in [-1, 1] and x_± = ±1
    # A_i = sum_n y_n x_n^i
    A = np.zeros((order,))
    for i in range(order):
        A[i] = y_train @ (x_train/order)**i

    # B_im = sum_n x_n^(i+m)
    B = np.zeros((order, order))
    for i in range(order):
        for j in range(order):
            B[i,j] = np.sum((x_train/order)**(i+j))

    # C_im = lambda i(i-1) m(m-1) (x_+^(i+m-3) - x_-^(i+m-3)) / (i+m-3)
    C = np.zeros((order, order))
    for i in range(2,order):
        for j in range(2, order):
            C[i,j] = lam * i * (i-1) * j * (j-1) * (1**(j+i-3) - (-1)**(j+i-3))/(j+i-3)

    # Solve the linear system a = (B + C)^-1 A
    # (note the + sign: the curvature penalty adds to the normal equations)
    c = np.linalg.solve(B + C, A)
    
    A = np.zeros((order, len(x_valid)))
    for i in range(order):
        A[i] = (x_valid/order)**i
    
    error = np.mean(np.square(y_valid - A.T@c))
    
    A = np.zeros((order, len(x_vis)))
    for i in range(order):
        A[i] = (x_vis/order)**i
    
    y_vis = A.T@c
    
    return error, y_vis
In [10]:
plt.figure()
plt.scatter(x_train, y_train, color='k', marker='x')
plt.xlim([-10.5, 10.5])
plt.ylim([-.5, 1.5])
for lam in [0, 0.01, 0.1, 1]:
    error, y_vis = fit_poly_regularizer(x_train, y_train, x_valid, y_valid, x_vis, 10, lam)
    plt.plot(x_vis, y_vis, label='Lambda %.2f, Error: %.3f' % (lam, error))
plt.legend()
plt.show()

(14.3) Train a neural network on the output from an order 4 maximal LFSR (Problem 6.3) and learn to reproduce it. How do the results depend on the network depth and architecture?

In [11]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
sns.set_context('notebook')
import keras
import keras.backend as K
from keras.models import Model
from keras.layers import Dense, Activation, Input
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
Using TensorFlow backend.
In [12]:
# Generate the LFSR data: x_n = x_{n-1} XOR x_{n-4} (order 4, maximal)
x_prev = [0, 0, 0, 1]
x_data = []
y_data = []
while len(x_data) < 10000:
    x = (x_prev[0] + x_prev[3]) % 2
    y_data.append(x)
    x_data.append(list(x_prev))  # copy the state; appending x_prev itself would alias one mutating list
    x_prev.pop()
    x_prev.insert(0, x)
x_data = np.array(x_data, dtype=np.float32)
y_data = np.array(y_data, dtype=np.float32)
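A quick sanity check (my addition): the order-4 register should cycle through all $2^4-1=15$ non-zero states before repeating.

# Verify the LFSR is maximal: the state returns to [0, 0, 0, 1] after 15 steps
state = [0, 0, 0, 1]
seen = []
while tuple(state) not in seen:
    seen.append(tuple(state))
    bit = (state[0] + state[3]) % 2
    state = [bit] + state[:-1]
print(len(seen))  # 15 for a maximal order-4 LFSR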
In [13]:
# Prepare datasets
# Dataset for the feed forward
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size=.2)
# Dataset for the recurrent
x_seq = y_data
# Dataset for the autoencoder
x_train_auto, x_valid_auto = train_test_split(x_data, test_size=.2)
In [14]:
def train_feed_forward(x_train, y_train, x_test, y_test, n_neurons=(), activation='sigmoid'):

    # Stack of fully connected layers on the 4-bit LFSR state
    inp = Input(batch_shape=(None, 4))
    x = inp
    for i in n_neurons:
        x = Dense(i)(x)
        x = Activation(activation)(x)
        
    out = Dense(1,use_bias=False, activation=activation)(x)
    
    model = Model(inp, out)
    
    model.compile(loss='mse',
                  optimizer='sgd')
    
    history = model.fit(x_train, y_train,
                    batch_size=100,
                    epochs=1000,
                    verbose=0,
                    validation_split=0,
                    callbacks=[EarlyStopping(patience=20, min_delta=.001, monitor='loss')])
    
    y_pred = model.predict(x_test, batch_size=50, verbose=0)
    error = model.evaluate(x_test, y_test, batch_size=50, verbose=0)
        
    return y_pred, model.count_params(), error
In [15]:
layers = [1, 2, 3, 5]
neurons = [2, 5, 10, 20]
error = np.zeros((len(layers), len(neurons)))
params = np.zeros_like(error)
for i,l in enumerate(layers):
    for j,n in enumerate(neurons):
        y_pred, params[i,j], error[i,j] = train_feed_forward(x_train, y_train, x_valid, y_valid, n_neurons=[n for _ in range(l)], activation='sigmoid')
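One possible follow-up (my addition, not in the original run): threshold the last model's sigmoid outputs at 0.5 and see how often it reproduces the LFSR bit.

# Bit accuracy of the final configuration from the loop above
bits = (y_pred.ravel() > 0.5).astype(np.float32)
print('Bit accuracy: %.3f' % np.mean(bits == np.asarray(y_valid).ravel()))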
In [16]:
print('Plot of error (heatmap) and number of params (annotation) for layers vs neurons')
ax = sns.heatmap(error, xticklabels=neurons, yticklabels=layers, annot=params)
ax.set_xlabel('Number of Neurons per layer')
ax.set_ylabel('Number of Layers')
plt.show()
Plot of error (heatmap) and number of params (annotation) for layers vs neurons