\textbf{Motivation}: So far, when looking for $\hat{f}(x)$, the form was $\hat{f}(x)=w^\top x$ or $\hat{f}(x) = w^\top \phi(x)$.
Note how the features $x, \phi(x)$ are predetermined. Why not learn them?
\textbf{New Optimization Problem}:
The new joint optimization problem, over $w$ and $\phi$:\\
\subtext{$\Theta$ is a set of parameters for $\phi$}
$$
(\hat{w},\hat{\Theta}) = \underset{w\in\R^m,\Theta\in\R^{m\times d}}{\text{arg min}}\Biggl( \frac{1}{n}\sum_{i=1}^n l\Bigl( w^\top \phi(x_i;\Theta),y_i \Bigr) \Biggr)
$$
Where $\phi(x;\Theta) = \Bigl( \phi_1(x;\theta_1),\ldots,\phi_m(x;\theta_m) \Bigr)$.\\
\subtext{$\theta_i$ is the $i$th row of $\Theta$, i.e. $\theta_i := (\Theta)_{i,:}$}
More compactly, in terms of $\Theta$, which now combines $w$ and the parameters of $\phi$:
$$
\Theta^* := \underset{\Theta}{\text{arg min}}\Bigl( L(\Theta; \mathcal{D}) \Bigr) = \underset{\Theta}{\text{arg min}} \Biggl( \frac{1}{n} \sum_{i=1}^{n} l\Bigl( \Theta;x_i,y_i \Bigr) \Biggr)
$$
\subtext{$\Theta$ may also encapsulate $w,\phi$ for multiple layers, depending on definition}
\subsection{Definitions}
\definition \textbf{Activation Function}\\
We set $\phi_i(x;\theta_i) = \psi(\theta_i^\top x)$, $\psi$ is the activation function.\\
\subtext{$\theta_i \in \R^d,\quad\psi:\R\to\R$}
{\scriptsize
\notation More concisely, $\phi(x;\Theta) = \psi(\Theta x)$
}
\begin{center}
\begin{tabular}{ll}
\textbf{Activation Function} & \textbf{Definition} \\
\hline
Identity & $\psi(z) = z$ \\
Sigmoid & $\psi(z) = \frac{1}{1+e^{-z}}$ \\
Hyperbolic tangent & $\psi(z) = \tanh(z)$ \\
Rectified Linear Unit (ReLU) & $\psi(z) = \max(0,z)$
\end{tabular}
\end{center}
\definition \textbf{Artificial Neural Network}\\
\subtext{The output functions of the above problem take the form:}
$$
f(x;w,\theta) = \sum_{j=1}^{m}w_j\psi(\theta_j^\top x)
$$
{\scriptsize
\remark Also called Multi-Layer Perceptron (MLP)
}
\newpage
\textbf{What is happening here?}\\
\smalltext{Explaining the calculation steps for such an $f$ naturally leads to the common pictorial depiction of neural networks.}
\begin{align*}
\text{(i)} &\quad x &=\quad& (x_1,\ldots,x_d) \in \R^d & \text{(Input Vector)} \\
\text{(ii)} &\quad z &=\quad& \Theta x & \text{(Linear transformation)} \\
\text{(iii)} &\quad h_i &=\quad& \psi(z_i) & \text{(Activation function)} \\
\text{(iv)} &\quad f(x) &=\quad& \sum_{j=1}^m w_j h_j & \text{(Output)}
\end{align*}
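\smalltext{\textbf{Example} (illustrative numbers, not from the script): with $d=2$, $m=2$ and ReLU,}
{\footnotesize
$$
x = \begin{bmatrix}1\\2\end{bmatrix},\;
\Theta = \begin{bmatrix}1 & 0\\-1 & 1\end{bmatrix},\;
w = \begin{bmatrix}1\\1\end{bmatrix}
\;\Rightarrow\;
z = \Theta x = \begin{bmatrix}1\\1\end{bmatrix},\;
h = \psi(z) = \begin{bmatrix}1\\1\end{bmatrix},\;
f(x) = \sum_{j} w_j h_j = 2
$$
}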
\definition \textbf{Hidden Layer} $h = \psi(z)$
\definition \textbf{Bias Term} $b \in \R^m$\\
\subtext{Needed, as $f$ might not pass through origin. Similar to using $F_\text{lin}$ in regression, these can also be added by augmenting the input \& hidden layers.}
\textbf{Does this work at all?}\\
\smalltext{Yes, for most functions this does work.}
\definition \textbf{Sigmoidal Function}
$$
\sigma: \R \to \R \text{ s.t. } \begin{cases}
\underset{t\to\infty}{\lim}\,\sigma(t) = 1 \\
\underset{t\to-\infty}{\lim}\,\sigma(t) = 0
\end{cases}
$$
\theorem \textbf{Universal Approximation Theorem}\\
\smalltext{$\hat{f}$, that uniformly approximates $f$, exists and takes this form:}
$$
\hat{f}(x) = \textbf{W}^{(2)}\psi\Bigl( \textbf{W}^{(1)}x + b \Bigr)
$$
\smalltext{$f: [0,1]^d\to\R$ continuous$,\quad \psi $ sigmoidal}\\
\subtext{$\textbf{W}^{(1)} \in\R^{m\times d},\quad \textbf{W}^{(2)}\in\R^{1\times m},\quad m \in \N$}
Note how $m$ could be very large.\\
\subtext{$m$ can intuitively be understood as the "width" of the ANN}
\newpage
\definition \textbf{Fully Connected Neural Network}
More complex ANNs might have:
\begin{enumerate}
\item More hidden layers
\item Multiple outputs
\item Different activation functions across layers
\end{enumerate}
These are called \textit{fully connected}, since every node in a layer is connected to every node in the adjacent layers.\\
\subtext{There are also more complex architectures.}
\begin{center}
\includegraphics[width=0.9\linewidth]{resources/FCANN.png}\\
\subtext{\textit{Introduction to Machine Learning (2026), p. 183}}
\end{center}
\notation Weights: $\textbf{W}^{(i)} := \Bigl[ w_{k,l}^{(i)} \Bigr]$, Biases: $b^{(i)}_k$\\
and $\Theta = \Bigl(\textbf{W}^{(1)},\ldots,\textbf{W}^{(L)}, b^{(1)},\ldots,b^{(L)}\Bigr)$ (All parameters)\\
\subtext{$w_{k,l}^{(i)}$: "Weight at layer $i$ to node $k$ from node $l$"}
\newpage
\subsection{Forward Propagation}
How can we make predictions, i.e. how can $\hat{f}$ be evaluated?
\definition \textbf{Forward Propagation}\\
\subtext{This is just the computation for the $1$-layer ANN, generalized to $L$ layers}
\begin{algorithm}
\caption{Forward Propagation}
$h^{(0)}\gets x$\;
\For{$l=1,\ldots,L-1$}{
$z^{(l)} = \textbf{W}^{(l)}h^{(l-1)} + b^{(l)}$ \\
$h^{(l)} = \psi(z^{(l)})$
}
$f \gets \textbf{W}^{(L)}h^{(L-1)}+b^{(L)}$ \\
\Return f
\end{algorithm}
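{\footnotesize
\remark A minimal NumPy sketch of this procedure (the list layout \texttt{Ws}, \texttt{bs} and the choice of ReLU are illustrative assumptions, not fixed by the script):
}
{\scriptsize
\begin{verbatim}
import numpy as np

def relu(z):
    return np.maximum(0.0, z)

def forward(x, Ws, bs, psi=relu):
    """Forward propagation for weights Ws = [W1,...,WL], biases bs = [b1,...,bL]."""
    h = x
    for W, b in zip(Ws[:-1], bs[:-1]):   # hidden layers l = 1,...,L-1
        z = W @ h + b                    # linear transformation
        h = psi(z)                       # activation
    return Ws[-1] @ h + bs[-1]           # linear output layer L
\end{verbatim}
}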
\subsection{Backward Propagation}
How can we get all gradients needed for model training?
\definition \textbf{Backward Propagation}\\
\textbf{Intuition}: An efficient way to get the gradients is to reuse results from forward prop. and previous steps. This works best when starting at the back, at $\nabla_{\textbf{W}^{(L)}}l$.
\textbf{Goal}: $\nabla_{\textbf{W}^{(1)}}l,\ldots,\nabla_{\textbf{W}^{(L)}} l, \nabla_{b^{(1)}}l,\ldots,\nabla_{b^{(L)}}l$
\textbf{Step 1}: Calculate $\nabla_{\textbf{W}^{(L)}}l$, i.e. start from the back.
\begin{align*}
\nabla_{\textbf{W}^{(L)}}l &= \frac{\partial l}{\partial \textbf{W}^{(L)}} \\
&= \frac{\partial l}{\partial f}\cdot\frac{\partial f}{\partial \textbf{W}^{(L)}} & \text{(Chain Rule)} \\
&= \frac{\partial l}{\partial f}\cdot\begin{bmatrix}
\bigl( h^{(L-1)} \bigr)^\top \\
\vdots \\
\bigl( h^{(L-1)} \bigr)^\top
\end{bmatrix} & (f = \textbf{W}^{(L)}h^{(L-1)} + b^{(L)}) \\
&= \nabla_f l \cdot\begin{bmatrix}
\bigl( h^{(L-1)} \bigr)^\top \\
\vdots \\
\bigl( h^{(L-1)} \bigr)^\top
\end{bmatrix} & \Biggl(\frac{\partial l}{\partial f} = \nabla_f l\Biggr)
\end{align*}
Notice how $h^{(L-1)}$ was computed during forward prop.
\newpage
\textbf{Step 2}: Calculate $\nabla_{\textbf{W}^{(L-1)}}l$.
\begin{align*}
\nabla_{\textbf{W}^{(L-1)}}l &= \underbrace{\frac{\partial l}{\partial f}}_{\text{(1)}}\cdot\underbrace{\frac{\partial f}{\partial h^{(L-1)}}}_{\text{(2)}}\cdot\underbrace{\frac{\partial h^{(L-1)}}{\partial z^{(L-1)}}}_\text{(3)}\cdot\underbrace{\frac{\partial z^{(L-1)}}{\partial \textbf{W}^{(L-1)}}}_\text{(4)} & \text{(Chain Rule)} \\
\end{align*}
\begin{enumerate}
\item Already done in Step 1.
\item Known from the model parameters (no new computation needed), equal to $\textbf{W}^{(L)}$:
$$
f \overset{\text{def}}{=} \textbf{W}^{(L)}h^{(L-1)}+b^{(L)} \implies \frac{\partial f}{\partial h^{(L-1)}} = \textbf{W}^{(L)}
$$
\item \textbf{Not done.} Needs to be calculated:
\begin{align*}
\frac{\partial h^{(L-1)}}{\partial z^{(L-1)}} &= \frac{\partial \psi\bigl( z^{(L-1)} \bigr)}{\partial z^{(L-1)}} \\
&= \text{diag}\Bigl( \psi'\bigl( z^{(L-1)} \bigr) \Bigr) \\
&= \begin{bmatrix}
\psi'\Bigl(z_1^{(L-1)}\Bigr) & 0 & \cdots & 0 \\
0 & \psi'\Bigl(z_2^{(L-1)}\Bigr) & \cdots & 0 \\
\vdots & \vdots & \ddots & \vdots \\
0 & 0 & \cdots & \psi'\Bigl(z_n^{(L-1)}\Bigr) \\
\end{bmatrix}
\end{align*}
\item Already done in forward propagation, analogous to step 1.
$$
\frac{\partial z^{(L-1)}}{\partial \textbf{W}^{(L-1)}} =
\begin{bmatrix}
\bigl( h^{(L-2)} \bigr)^\top \\
\vdots \\
\bigl( h^{(L-2)} \bigr)^\top
\end{bmatrix}
$$
\end{enumerate}
\textbf{Step $i \leq L$}: Calculate $\nabla_{\textbf{W}^{(L-i+1)}}l$, analogous to Step 2.\\
\subtext{The biases $\nabla_{b^{(l)}}l$ are analogous.}
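{\footnotesize
\remark A NumPy sketch of these steps for a network with sigmoid hidden layers, a linear output and squared loss (these concrete choices are assumptions for illustration). It caches the $z^{(l)}, h^{(l)}$ from the forward pass and walks backwards exactly as above:
}
{\scriptsize
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def backprop(x, y, Ws, bs):
    # forward pass, caching h^(l) and z^(l)
    hs, zs = [x], []
    for W, b in zip(Ws[:-1], bs[:-1]):
        zs.append(W @ hs[-1] + b)
        hs.append(sigmoid(zs[-1]))
    f = Ws[-1] @ hs[-1] + bs[-1]            # linear output layer

    dWs, dbs = [None] * len(Ws), [None] * len(bs)
    delta = f - y                           # dl/df for loss 1/2 ||f - y||^2
    dWs[-1] = np.outer(delta, hs[-1])       # Step 1: reuses h^(L-1)
    dbs[-1] = delta
    for i in range(len(Ws) - 2, -1, -1):    # Steps 2, 3, ...: walk backwards
        # chain rule: (W^(l+1))^T delta, then through diag(psi'(z^(l)))
        delta = (Ws[i + 1].T @ delta) * sigmoid(zs[i]) * (1 - sigmoid(zs[i]))
        dWs[i] = np.outer(delta, hs[i])     # reuses h^(l-1) from forward prop
        dbs[i] = delta
    return dWs, dbs
\end{verbatim}
}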
\newpage
\subsection{Optimization}
\textbf{Problem}: How can we train the model, i.e. find $\Theta^*$?
$$
\Theta^* := \underset{\Theta}{\text{arg min}}\Bigl( L(\Theta; \mathcal{D}) \Bigr) = \underset{\Theta}{\text{arg min}} \Biggl( \frac{1}{n} \sum_{i=1}^{n} l\Bigl( \Theta;x_i,y_i \Bigr) \Biggr)
$$
{\footnotesize
\remark $L(\Theta;\mathcal{D})$ is generally not convex.\\
{\color{gray}
i.e. local minima, saddle points may exist
}
\remark $\dim(\Theta)$ is the total param. count of NN, may be very large
}
\textbf{Solution}: Gradient Descent (with optimizations)
\begin{itemize}
\item Stochastic Gradient Descent\\
\subtext{(Why? $\dim(\Theta)$ is very large, $\nabla_\Theta l(\Theta;x_i,y_i)$ are expensive)}
\item Minibatch Gradient Descent\\
\subtext{(Why? $\mathcal{D}$ may be very large, so there are \textit{many} gradients)}
\end{itemize}
The standard GD update for $\Theta$ is:
$$
\Theta^{t+1} = \Theta^t - \eta_t\cdot\nabla_\Theta L\Bigl( \Theta^t;\mathcal{D} \Bigr)
$$
In Minibatch GD, this becomes:\\
\subtext{Where $\mathcal{S} \subset \{1,\ldots,n\}$}
$$
\Theta^{t+1} = \Theta^t - \eta_t\cdot\nabla_{\Theta^t} \Biggl( \frac{1}{|\mathcal{S}|}\sum_{i\in \mathcal{S}} l\Bigl( \Theta^t; x_i,y_i \Bigr) \Biggr)
$$
{\footnotesize
\remark An advantage of MB-GD: if $\Theta^t$ approaches a stationary point that is not the global minimum, GD converges to it, while the gradient noise of MB-GD may allow escaping it.
}
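{\footnotesize
\remark A sketch of the minibatch update loop (the \texttt{grad\_loss} callback, batch size and fixed learning rate are illustrative assumptions):
}
{\scriptsize
\begin{verbatim}
import numpy as np

def minibatch_sgd(theta, X, Y, grad_loss, eta=0.1, batch_size=32, epochs=10):
    """grad_loss(theta, Xb, Yb) must return the gradient averaged over the batch."""
    n = len(X)
    for _ in range(epochs):
        perm = np.random.permutation(n)           # shuffle once per epoch
        for start in range(0, n, batch_size):
            S = perm[start:start + batch_size]    # minibatch indices S
            theta = theta - eta * grad_loss(theta, X[S], Y[S])
    return theta
\end{verbatim}
}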
The following subsections go into more detail:
\begin{enumerate}
\item Preventing vanishing \& exploding Gradients
\item Choice of initial $w_i$
\item Choice of the learning rate $\eta_t$
\end{enumerate}
\newpage
\subsubsection{Vanishing \& Exploding Gradients}
$$
\nabla_{\Theta^t} \Biggl( \frac{1}{|\mathcal{S}|}\sum_{i\in \mathcal{S}} l\Bigl( \Theta^t; x_i,y_i \Bigr) \Biggr) = \frac{1}{|\mathcal{S}|}\sum_{i \in \mathcal{S}}\Bigl( \nabla_{\Theta^t} l(\Theta^t;x_i,y_i) \Bigr)
$$
The terms $\nabla_{\Theta^t} l\Bigl(\Theta^t;x_i,y_i\Bigr)$ each contain $\nabla_{\textbf{W}^{(l)}}l\Bigl(\textbf{W}^{(l)};x_i,y_i\Bigr)$.
\textbf{Problem}: Optimization might fail if:
$$
\Bigl\Vert \nabla_{\textbf{W}^{(l)}}l \Bigr\Vert \to \infty \qquad\text{or}\qquad \Bigl\Vert \nabla_{\textbf{W}^{(l)}}l \Bigr\Vert \to 0
$$
\textbf{Solution}: $\Vert \nabla_{\textbf{W}^{(l)}}l \Vert$ depends linearly on $\text{diag}\Bigl( \psi'(z^{(l)}) \Bigr)$, so the choice of $\psi$ ($\psi'$) can be used to constrain $\Vert \nabla_{\textbf{W}^{(l)}}l \Vert$\\
\subtext{Generally, the gradient follows the behaviour of $\psi'$}
% Script contains examples for this on Sigmoid & ReLU. Generally, the properties we need are visible by inspection of the derivatives graph.
Which properties do we want $\psi$ ($\psi'$) to fulfill?
\begin{itemize}
\item $\psi'$ should be fast to calculate
\item $\psi'$ should be non-zero (and not get too close to zero)
\end{itemize}
\remark $\Bigl\Vert \nabla_{\textbf{W}^{(l)}}l \Bigr\Vert$ may still vanish, for any choice of $\psi$.
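{\footnotesize
\remark For intuition (a standard calculation, not the script's worked example): the sigmoid has
}
$$
\psi'(z) = \psi(z)\bigl(1-\psi(z)\bigr) \in \Bigl(0,\tfrac{1}{4}\Bigr],
$$
{\footnotesize
so each layer contributes a factor $\text{diag}\bigl(\psi'(z^{(l)})\bigr)$ with entries at most $\tfrac{1}{4}$, and deep networks tend towards vanishing gradients. ReLU has $\psi'(z)\in\{0,1\}$: active units pass gradients through unchanged, but units with $z<0$ block them entirely.
}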
\subsubsection{Random Weight Initialization}
\begin{align*}
\nabla_{\textbf{W}^{(l)}}l &= \frac{\partial l}{\partial f}\cdot\frac{\partial f}{\partial h^{(l)}}\cdot\frac{\partial h^{(l)}}{\partial z^{(l)}}\cdot\frac{\partial z^{(l)}}{\partial \textbf{W}^{(l)}} \\
&= \frac{\partial l}{\partial f}\cdot\frac{\partial f}{\partial h^{(l)}}\cdot\text{diag}\Bigl( \psi'(z^{(l)}) \Bigr)\cdot\begin{bmatrix}
(h^{(l-1)})^\top \\
\vdots \\
(h^{(l-1)})^\top
\end{bmatrix}
\end{align*}
So, the gradient norm $\Vert \nabla_{\textbf{W}^{(l)}}l \Vert$ also depends on $\Vert h^{(l-1)} \Vert$.\\
\subtext{For $l \in \{1,\ldots,L\}$}
\textbf{Problem}: It might be that $\Vert h^{(l-1)} \Vert \to \infty$ or $\Vert h^{(l-1)} \Vert \to 0$.
\textbf{Solution}: Initialize the weights randomly such that the mean $\mu$ and variance $\sigma^2$ of $h^{(l-1)}$ stay bounded.\\
\subtext{There is no generally optimal bound for $\sigma^2$, it depends on the NN.}
\newpage
\remark Practical distributions for common $\psi$:\\
\subtext{$n_\text{in},n_\text{out}$ are the node counts of the layers adjacent to $w_i$}
% Table in script
\begin{center}
\begin{tabular}{l|l}
$\psi$ & \textbf{Weights} \\
\hline
$\tanh$ & $w_i \sim \mathcal{N}\Bigl( 0, \frac{1}{n_{\text{in}}} \Bigr)$ \\
$\tanh$ & $w_i \sim \mathcal{N}\Bigl( 0, \frac{2}{n_\text{in}+n_\text{out}} \Bigr)$ \\
\text{ReLU} & $w_i \sim \mathcal{N}\Bigl( 0, \frac{2}{n_\text{in}} \Bigr)$
\end{tabular}
\end{center}
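{\footnotesize
\remark A sketch of sampling such weights with NumPy (these schemes are commonly known as Xavier/Glorot and He initialization; the helper name and arguments are illustrative):
}
{\scriptsize
\begin{verbatim}
import numpy as np

def init_weights(n_in, n_out, psi="relu"):
    """Sample an (n_out, n_in) weight matrix with the variances from the table."""
    if psi == "relu":
        var = 2.0 / n_in                  # He initialization
    elif psi == "tanh_glorot":
        var = 2.0 / (n_in + n_out)        # Xavier/Glorot initialization
    else:
        var = 1.0 / n_in                  # tanh, 1/n_in variant
    return np.sqrt(var) * np.random.randn(n_out, n_in)
\end{verbatim}
}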
\subsubsection{Learning Rate \& Weight Updates}
$$
\Theta^{t+1} = \Theta^{t} - \eta_t \cdot \nabla_{\Theta^t}\Biggl( \frac{1}{|\mathcal{S}|}\sum_{i\in\mathcal{S}}l\Bigl( \Theta^t; x_i,y_i \Bigr) \Biggr)
$$
\textbf{Problem}: How to choose the learning rate $\eta_t$?
\textbf{Solution}: Heuristics.\\
\subtext{There is no generally optimal $\eta_t$.}
\method \textbf{Piecewise constant $\eta_t$}
Intuitively, it makes sense to reduce $\eta_t$ as optimization progresses, as the algorithm approaches the minimum.\\
\subtext{Linear/cosine decay could also be used.}
$$
\eta_t = \begin{cases}
1 & 0 \leq t < 3 \\
0.5 & 3 \leq t < 6 \\
0.25 & 6 \leq t < 9 \\
\ldots
\end{cases}
$$
\method \textbf{Weight update indicator}
In practice, SGD often oscillates while approaching the minimum. Then, a purely monotonic $\eta_t$ doesn't make sense.
$$
\frac{\Bigl\Vert \nabla_{\Theta^t} L( \Theta^t;\mathcal{D} ) \Bigr\Vert}{\Vert \Theta^t \Vert}
$$
In general, if this indicator ratio is small, a higher learning rate makes sense (and vice versa).\\
\subtext{Intuitively: how large the weight change is relative to the weight size.}
\method \textbf{Momentum}
Combine the current update direction with the previous update directions, weighted by some $m > 0$, to stabilize the updates.
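{\footnotesize
\remark One common formulation (not spelled out above) keeps a running direction $v^t$, with $v^0 = 0$:
}
$$
v^{t+1} = m\cdot v^{t} - \eta_t\cdot\nabla_{\Theta^t}L\bigl( \Theta^t;\mathcal{D} \bigr), \qquad \Theta^{t+1} = \Theta^t + v^{t+1}
$$
{\footnotesize
For $m = 0$ this reduces to plain GD; larger $m$ averages over more of the past directions.
}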
\newpage
\subsection{Regularization}
\textbf{Problem}: How can overfitting be avoided?
A few methods can be applied directly to SGD:
\method \textbf{Penalty Term}
Similar to Ridge/LASSO Regression, a penalty term can be used, with some weight $\lambda > 0$.
$$
\underset{\Theta \in \R^d}{\text{arg min}}\Bigl( L(\Theta;\mathcal{D}) \Bigr) \quad \to \quad \underset{\Theta \in \R^d}{\text{arg min}}\Bigl( L(\Theta;\mathcal{D}) + \lambda \Vert\Theta\Vert^2 \Bigr)
$$
\method \textbf{Early Stopping}
Choosing a different stopping criterion for SGD, e.g. based on performance on the validation set $\mathcal{D}'$.
\remark \textbf{Validation \& Training Error}
Overfitting occurs when the training error (on $\mathcal{D}$) continues to fall while the validation error (on $\mathcal{D}'$) increases.
\begin{center}
\includegraphics[width=0.7\linewidth]{resources/ValidationTrainingErrors.png}\\
\color{gray}\footnotesize
\textit{Introduction to Machine Learning (2026), p. 196}
\end{center}
\newpage
\subsubsection{Dropout Regularization}
This is a method specific to Neural Networks.
\method \textbf{Dropout}
Fix some $p \in (0,1)$. In each SGD iteration during training:\\
\textit{Drop out} each hidden unit with probability $1-p$, i.e. remove it from the network for this iteration.
\begin{align*}
z_j^{(l)} &= \sum_{i=1}^{n_{l-1}}\Bigl( w_{j,i}^{(l)}h_i^{(l-1)} \Bigr) + b_j^{(l)} & \text{(Regular Neuron)} \\
z_j^{(l)} &= \sum_{i=1}^{n_{l-1}}\Bigl( w_{j,i}^{(l)}h_i^{(l-1)} \cdot \mathbb{I}_{C_i} \Bigr) + b_j^{(l)} & \text{(With Dropout)}
\end{align*}
\subtext{Where $C_i:=\{\text{"Unit } h_i^{(l-1)} \text{ is kept this iter."}\}$, so $\P[C_i] = p$}
\textbf{Problem}: At test time (on $\mathcal{D}'$), we again want to use all units.
\textbf{Solution}: Scale all weights by $p$.
% This makes sense but I don't get why exactly the scaling is needed
For this, we use $\E\Bigl[ z_j^{(l)} \Bigr]$ instead of $z_j^{(l)}$.
\begin{align*}
\E\Bigl[ z_j^{(l)} \Bigr] = \Bigl( p\cdot w_j^{(l)} \Bigr)^\top \cdot h^{(l-1)} + b_j^{(l)}
\end{align*}
\subtext{By using $\E\bigl[ \mathbb{I}_{C_i} \bigr] = \P[C_i] = p$}
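{\footnotesize
\remark A sketch of one layer with dropout (Bernoulli mask with keep probability $p$ during training, weights scaled by $p$ at test time; shapes and names are illustrative):
}
{\scriptsize
\begin{verbatim}
import numpy as np

def dropout_pre_activation(h_prev, W, b, p, train=True):
    """Pre-activation z^(l) with dropout applied to the incoming units h^(l-1)."""
    if train:
        keep = np.random.rand(h_prev.shape[0]) < p   # indicator of C_i, P[C_i] = p
        return W @ (h_prev * keep) + b
    # test time: keep all units, scale the weights by p so that z matches E[z]
    return (p * W) @ h_prev + b
\end{verbatim}
}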
\subsubsection{Batch Normalization}
During SGD, the variance of the weights (and hence of the layer inputs) may again grow.\\
\subtext{After a few iterations, the $w_i$ may have changed completely from their init.}
\textbf{Problem}: Internal covariate shift.\\
\subtext{The mean $\mu$ of the layer inputs deviates from $0$ and their $\sigma^2$ might increase}
\textbf{Solution}: Standardize the layer inputs also during training.
\newpage
\method \textbf{Batch Normalization}
% The script doesn't specify how \alpha is set, unfortunately
In practice, only minibatches of the layer inputs are normalized. The core idea is to map $\mu \mapsto 0$ and $\sigma^2 \mapsto 1$ within the batch.\\
\subtext{This isn't optimal for all problems, and can be tweaked using $\beta,\gamma$}
The algorithm uses the parameters $\beta, \gamma$ and buffers $\mu_\text{EMA}, \sigma^2_\text{EMA}$.\\
\subtext{$\beta,\gamma$ are learnable and can also be optimized}
\textbf{Normalization Step} (Training set)\\
\smalltext{For a minibatch $\mathcal{S} = \{i_1,\ldots,i_k\}$, the batch is $\{x_{i_1},\ldots,x_{i_k}\}$.}
\textbf{Step 1}: Find current values of $\mu, \sigma^2$.
\begin{align*}
\mu_\mathcal{S} &:= \frac{1}{|\mathcal{S}|}\sum_{j\in\mathcal{S}}x_j & \text{\color{gray}\footnotesize(minibatch mean)} \\
\sigma^2_\mathcal{S} &:= \frac{1}{|\mathcal{S}|}\sum_{j\in\mathcal{S}}\Bigl( x_j-\mu_\mathcal{S} \Bigr)^2 & \text{\color{gray}\footnotesize(minibatch variance)}
\end{align*}
\textbf{Step 2}: Update the moving average: $\mu_\text{EMA},\sigma^2_\text{EMA}$.
\begin{align*}
\mu_\text{EMA} &= (1-\alpha)\mu_\text{EMA} + \alpha \cdot \mu_\mathcal{S} & \text{\color{gray}\footnotesize(avg. mean update)} \\
\sigma^2_\text{EMA} &= (1-\alpha)\sigma^2_\text{EMA}+\alpha\cdot \sigma_\mathcal{S}^2 & \text{\color{gray}\footnotesize(avg. variance update)}
\end{align*}
\textbf{Step 3}: Update the $x_j$.
\begin{align*}
\hat{x}_j &= \frac{x_j-\mu_\mathcal{S}}{\sqrt{\sigma^2_\mathcal{S} + \epsilon}} & \text{\color{gray}\footnotesize(point normalization)} \\
\bar{x}_j &= \gamma\cdot\hat{x}_j + \beta & \text{\color{gray}\footnotesize(scale \& shift)}
\end{align*}
\textbf{Normalization Step} (Test set)\\
\smalltext{Only apply step 3, now using the moving average values.}
\begin{align*}
\hat{x}_j &= \frac{x_j-\mu_\text{EMA}}{\sqrt{\sigma^2_\text{EMA}+\epsilon}} & \text{\color{gray}\footnotesize(point normalization)} \\
\bar{x}_j &= \gamma\cdot\hat{x}_j + \beta & \text{\color{gray}\footnotesize(scale \& shift)}
\end{align*}
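{\footnotesize
\remark A sketch of both modes (the EMA weight \texttt{alpha}, \texttt{eps} and the state layout are assumptions; $\gamma,\beta$ would be trained with the rest of the network):
}
{\scriptsize
\begin{verbatim}
import numpy as np

def batch_norm(x, state, gamma, beta, alpha=0.1, eps=1e-5, train=True):
    """x: (batch, features); state holds the EMA buffers mu_ema / var_ema."""
    if train:
        mu, var = x.mean(axis=0), x.var(axis=0)           # minibatch statistics
        state["mu_ema"] = (1 - alpha) * state["mu_ema"] + alpha * mu
        state["var_ema"] = (1 - alpha) * state["var_ema"] + alpha * var
    else:                                                  # test: use the EMA buffers
        mu, var = state["mu_ema"], state["var_ema"]
    x_hat = (x - mu) / np.sqrt(var + eps)                  # point normalization
    return gamma * x_hat + beta                            # scale & shift
\end{verbatim}
}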
\newpage
\subsection{Convolutional Neural Networks}
In fully connected NNs:
$$
h^{(l)} = \psi\Bigl( \textbf{W}^{(l)} h^{(l-1)} \Bigr)
$$
Each unit of layer $l-1$ affects each unit of layer $l$.\\
In CNNs, this is relaxed: not all nodes (must) interact.
\definition \textbf{Convolutional Neural Network} (CNN)\\
Layers are connected via convolutions.
$$
h^{(l)} = \psi\Bigl( w^{(l)} * h^{(l-1)} \Bigr)
$$
{\footnotesize
\remark In CNNs, the weights are also called \textit{filters}.
}
\definition \textbf{Convolution} {\footnotesize (Discrete, 1D) } \\
\subtext{$w \in \R^k,\quad x \in \R^d$}
$$
(w * x)_i := \sum_{j=\max\{1,i-d+1\}}^{\min\{i,k\}}\Biggl( w_j\cdot x_{i-j+1} \Biggr) \qquad i = 1,\ldots,d+k-1
$$
Understanding this is easier by example:
\smalltext{\textbf{Example}: $w = (w_1,w_2)^\top,\quad x=(x_1,x_2,x_3)^\top$}
{\footnotesize
$$
w*x = \begin{bmatrix}
w_1 \cdot x_1 + \color{gray}w_2 \cdot 0 \\
w_1 \cdot x_2 + w_2 \cdot x_1 \\
w_1 \cdot x_3 + w_2 \cdot x_2 \\
\color{gray}w_1 \cdot 0 + \color{black}w_2 \cdot x_3
\end{bmatrix}
$$
}
\smalltext{\textbf{Example}: a CNN with $3$ inputs and $1$ hidden layer:}
{\footnotesize
$$
\begin{bmatrix}
z_1 \\
z_2 \\
z_3 \\
z_4
\end{bmatrix} = \underbrace{\begin{bmatrix}
w_1 & 0 & 0 \\
w_2 & w_1 & 0 \\
0 & w_2 & w_1 \\
0 & 0 & w_2 \\
\end{bmatrix}}_{\textbf{W}^{(1)} \text{ in CNN}} \cdot \begin{bmatrix}
x_1 \\
x_2 \\
x_3
\end{bmatrix} = w * x
$$
}
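{\footnotesize
\remark The first example can be checked directly with NumPy's full discrete convolution (a sanity check with made-up numbers, not part of the script):
}
{\scriptsize
\begin{verbatim}
import numpy as np

w = np.array([1.0, 2.0])        # (w1, w2)
x = np.array([3.0, 4.0, 5.0])   # (x1, x2, x3)

# (w*x)_i = sum_j w_j * x_{i-j+1}; np.convolve computes exactly this "full" convolution
print(np.convolve(x, w))         # [ 3. 10. 13. 10.]
\end{verbatim}
}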
\subsubsection{Multidimensional Convolution}
\textbf{TODO} add explanation
% The script has a very good intuitive walkthrough of how this works.