[IML] NN done, k-means

This commit is contained in:
RobinB27
2026-04-28 16:19:25 +02:00
parent b01098cce3
commit e18bd73fed
5 changed files with 329 additions and 5 deletions
Binary file not shown.
+4
@@ -26,4 +26,8 @@
\section{Neural Networks}
\input{parts/04_networks.tex}
\newpage
\section{Unsupervised Learning}
\input{parts/05_unsupervised.tex}
\end{document}
+229 -5
@@ -229,19 +229,26 @@ $$
\remark An advantage: If $\Theta^t$ approaches a stationary point (which isn't the global minimum), GD will converge there, but MB-GD may not.
}
The following subsections go into more detail:
\begin{enumerate}
\item Preventing vanishing \& exploding gradients
\item Choice of initial $w_i$
\item Choice of $\mu_t$
\end{enumerate}
\newpage
\subsubsection{Vanishing \& Exploding Gradients}
$$
\nabla_{\Theta^t} \Biggl( \frac{1}{|\mathcal{S}|}\sum_{i\in \mathcal{S}} l\Bigl( \Theta^t; x_i,y_i \Bigr) \Biggr) = \frac{1}{|\mathcal{S}|}\sum_{i \in \mathcal{S}}\Bigl( \nabla_{\Theta^t} l(\Theta^t;x_i,y_i) \Bigr)
$$
The terms $\nabla_{\Theta^t} l\Bigl(\Theta^t;x_i,y_i\Bigr)$ each contain $\nabla_{\textbf{W}^{(l)}}l\Bigl(\textbf{W}^{(l)};x_i,y_i\Bigr)$.
\textbf{Problem}: Optimization might fail if:
$$
\Bigl\Vert \nabla_{\textbf{W}^{(l)}}l \Bigr\Vert \to \infty \qquad\text{or}\qquad \Bigl\Vert \nabla_{\textbf{W}^{(l)}}l \Bigr\Vert \to 0
$$
\newpage
\textbf{Solution}: $\Vert \nabla_{\textbf{W}^{(l)}}l \Vert$ depends linearly on $\text{diag}\Bigl( \psi'(z^{(l)}) \Bigr)$, so the choice of $\psi$ ($\psi'$) can be used to constrain $\Vert \nabla_{\textbf{W}^{(l)}}l \Vert$\\
\subtext{Generally, the gradient follows the behaviour of $\psi'$}
@@ -273,6 +280,223 @@ So, the gradient $\Vert \nabla_{\textbf{W}^{(l)}}l \Vert$ also depends on $\Vert
\textbf{Solution}: Initialize the $w_i$ randomly, bounding the mean $\mu$ and var. $\sigma^2$ of $h^{(l-1)}$.\\
\subtext{There is no generally optimal bound for $\sigma^2$, it depends on the NN.}
\newpage
% Table in script
\remark Practical distributions for common $\psi$:\\
\subtext{$n_\text{in},n_\text{out}$ are the node counts of the layers adjacent to $w_i$}
\begin{center}
\begin{tabular}{l|l}
$\psi$ & \textbf{Weights} \\
\hline
$\tanh$ & $w_i \sim \mathcal{N}\Bigl( 0, \frac{1}{n_{\text{in}}} \Bigr)$ \\
$\tanh$ & $w_i \sim \mathcal{N}\Bigl( 0, \frac{2}{n_\text{in}+n_\text{out}} \Bigr)$ \\
\text{ReLU} & $w_i \sim \mathcal{N}\Bigl( 0, \frac{2}{n_\text{in}} \Bigr)$
\end{tabular}
\end{center}
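{\footnotesize
\remark A minimal NumPy sketch (not from the script) of sampling initial weights per the table above; the layer sizes and the fallback distribution for other $\psi$ are assumptions.
}
\begin{verbatim}
import numpy as np

def init_weights(n_in, n_out, psi="relu",
                 rng=np.random.default_rng(0)):
    """Sample an (n_out, n_in) weight matrix with the
    variance suggested for the activation psi."""
    if psi == "tanh":            # Xavier: Var = 2/(n_in+n_out)
        var = 2.0 / (n_in + n_out)
    elif psi == "relu":          # He: Var = 2/n_in
        var = 2.0 / n_in
    else:                        # fallback (assumption): 1/n_in
        var = 1.0 / n_in
    return rng.normal(0.0, np.sqrt(var), size=(n_out, n_in))

W1 = init_weights(784, 128, psi="relu")  # example layer
\end{verbatim}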
\subsubsection{Learning Rate \& Weight Updates}
$$
\Theta^{t+1} = \Theta^{t} - \mu_t \cdot \nabla_{\Theta^t}\Biggl( \frac{1}{|\mathcal{S}|}\sum_{i\in\mathcal{S}}l\Bigl( \Theta^t; x_i,y_i \Bigr) \Biggr)
$$
\textbf{Problem}: How to choose $\mu$? (Learning Rate)
\textbf{Solution}: Heuristics.\\
\subtext{There is no generally optimal $\mu$.}
\method \textbf{Piecewise constant $\mu_t$}
Intuitively, it makes sense to reduce $\mu_t$ as optimization progresses and the algorithm approaches the minimum.\\
\subtext{Linear/cosine decay could also be used.}
$$
\mu_t = \begin{cases}
1 & 0 \leq t < 3 \\
0.5 & 3 \leq t < 6 \\
0.25 & 6 \leq t < 9 \\
\ldots
\end{cases}
$$
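{\footnotesize
\remark A small sketch (not from the script) of the schedule above, halving $\mu_t$ every $3$ iterations; the initial rate and step size are placeholders.
}
\begin{verbatim}
def step_lr(t, mu0=1.0, step=3, factor=0.5):
    """Piecewise-constant learning rate:
    mu0 on [0,3), mu0*0.5 on [3,6), mu0*0.25 on [6,9), ..."""
    return mu0 * factor ** (t // step)

# t = 0..8 -> 1, 1, 1, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25
rates = [step_lr(t) for t in range(9)]
\end{verbatim}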
\method \textbf{Weight update indicator}
In practice, SGD often oscillates around the minimum; a monotonically decreasing $\mu_t$ then doesn't make sense.
$$
\frac{\Bigl\Vert \nabla_{\Theta^t} L( \Theta^t;\mathcal{D} ) \Bigr\Vert}{\Vert \Theta^t \Vert}
$$
In general, if this indicator ratio is small, a higher learning rate makes sense (and vice versa).\\
\subtext{Intuitively: how strong the weight change is, relative to the weight size.}
\method \textbf{Momentum}
Combine the update direction with the previous update directions, for some weight $m > 0$, to stabilize.
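{\footnotesize
\remark A minimal sketch (not from the script) of one SGD step with momentum; \texttt{grad}, \texttt{theta} and the weight $m$ are placeholders.
}
\begin{verbatim}
import numpy as np

def sgd_momentum_step(theta, grad, velocity, mu=0.1, m=0.9):
    """Combine the current update direction with the
    previous ones (weighted by m) to stabilize."""
    velocity = m * velocity - mu * grad   # accumulate directions
    return theta + velocity, velocity     # apply combined update

theta, v = np.zeros(3), np.zeros(3)
theta, v = sgd_momentum_step(theta, np.array([1.0, -2.0, 0.5]), v)
\end{verbatim}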
\newpage
\subsection{Regularization}
\textbf{Problem}: How can overfitting be avoided?
A few methods can be applied directly to SGD:
\method \textbf{Penalty Term}
Similar to Ridge/LASSO Regression, a penalty term can be used, with some weight $\lambda > 0$.
$$
\underset{\Theta \in \R^d}{\text{arg min}}\Bigl( L(\Theta;\mathcal{D}) \Bigr) \quad \to \quad \underset{\Theta \in \R^d}{\text{arg min}}\Bigl( L(\Theta;\mathcal{D}) + \lambda \Vert\Theta\Vert^2 \Bigr)
$$
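{\footnotesize
\remark A small sketch (not from the script) of adding the $\lambda\Vert\Theta\Vert^2$ penalty to an existing loss and gradient; \texttt{loss}, \texttt{grad}, \texttt{lam} are placeholders.
}
\begin{verbatim}
import numpy as np

def penalized(loss, grad, theta, lam=1e-3):
    """L(theta) + lam*||theta||^2 and its gradient
    (the penalty contributes 2*lam*theta)."""
    return loss + lam * np.sum(theta**2), grad + 2.0 * lam * theta
\end{verbatim}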
\method \textbf{Early stopping}
Choose a different stopping criterion for SGD, e.g. the performance on the test set $\mathcal{D}'$.
\remark \textbf{Validation \& Training Error}
Overfitting occurs when the training error (on $\mathcal{D}$) continues to fall, but the test error (on $\mathcal{D}'$) increases.
\begin{center}
\includegraphics[width=0.7\linewidth]{resources/ValidationTrainingErrors.png}\\
\color{gray}\footnotesize
\textit{Introduction to Machine Learning (2026), p. 196}
\end{center}
\newpage
\subsubsection{Dropout Regularization}
This is a method specific to Neural Networks.
\method \textbf{Dropout}
Fix some $p \in (0,1)$. For each SGD iteration in training:\\
\textit{Drop out} each hidden unit with probability $1-p$, and skip its optimization for this iteration.
\begin{align*}
z_j^{(l)} &= \sum_{i=0}^{n_{l-1}}\Bigl( w_{j,i}^{(l)}h_i^{(l-1)} \Bigr) + b_j^{(l)} & \text{(Regular Neuron)} \\
z_j^{(l)} &= \sum_{i=0}^{n_{l-1}}\Bigl( w_{j,i}^{(l)}h_i^{(l-1)} \cdot \mathbb{I}_{C_i} \Bigr) + b_j^{(l)} & \text{(With Dropout)}
\end{align*}
\subtext{Where $C_i:=\{\text{"Unit } h_i^{(l-1)} \text{ is kept this iter."}\}$, so $\P[C_i] = p$}
\textbf{Problem}: For $\mathcal{D}'$, we again want to use all units.
\textbf{Solution}: Scale all weights by $p$.
% At test time all units are active; without scaling, z_j would be larger in expectation than during training.
For this, we use $\E\Bigl[ z_j^{(l)} \Bigr]$ instead of $z_j^{(l)}$, so the test-time pre-activations match the training-time expectation.
\begin{align*}
\E\Bigl[ z_j^{(l)} \Bigr] = \Bigl( p\cdot w_j^{(l)} \Bigr)^\top \cdot h^{(l-1)} + b_j^{(l)}
\end{align*}
\subtext{By using $\E\bigl[ \mathbb{I}_{C_i} \bigr] = \P[C_i] = p$}
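{\footnotesize
\remark A minimal NumPy sketch (not from the script) of the dropout pre-activation; \texttt{h\_prev}, \texttt{W}, \texttt{b} are placeholders. At test time all units are kept and the weights are scaled by $p$.
}
\begin{verbatim}
import numpy as np

def dropout_preactivation(h_prev, W, b, p=0.8, training=True,
                          rng=np.random.default_rng(0)):
    """z = W h + b, each unit of h kept with probability p
    during training (indicator I_{C_i})."""
    if training:
        keep = rng.random(h_prev.shape) < p   # P[keep] = p
        return W @ (h_prev * keep) + b
    return (p * W) @ h_prev + b               # test: scale by p
\end{verbatim}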
\subsubsection{Batch Normalization}
During SGD, the weights' variance $\sigma^2$ may again explode.\\
\subtext{After few iterations, $w_i$ may have changed completely from init.}
\textbf{Problem}: Internal covariate shift.\\
\subtext{The mean $\mu$ deviates from $0$ and $\sigma^2$ might increase}
\textbf{Solution}: Standardize the activations (outputs of $\psi$) also during training.
\newpage
\method \textbf{Batch Normalization}
% The script doesn't specify how \alpha is set, unfortunately
In practice, only mini-batches of activations are normalized. The core idea is to map $\mu \mapsto 0$ and $\sigma^2 \mapsto 1$ within the batch.\\
\subtext{This isn't optimal for all problems, and can be tweaked using $\beta,\gamma$}
The algorithm uses the parameters $\beta, \gamma$ and buffers $\mu_\text{EMA}, \sigma^2_\text{EMA}$.\\
\subtext{$\beta,\gamma$ are learnable and can also be optimized}
\textbf{Normalization Step} (Training set)\\
\smalltext{For a minibatch $\mathcal{S} = \{i_1,\ldots,i_k\}$, the batch is $\{x_{i_1},\ldots,x_{i_k}\}$.}
\textbf{Step 1}: Find current values of $\mu, \sigma^2$.
\begin{align*}
\mu_\mathcal{S} &:= \frac{1}{|\mathcal{S}|}\sum_{j\in\mathcal{S}}x_j & \text{\color{gray}\footnotesize(minibatch mean)} \\
\sigma^2_\mathcal{S} &:= \frac{1}{|\mathcal{S}|}\sum_{j\in\mathcal{S}}\Bigl( x_j-\mu_\mathcal{S} \Bigr)^2 & \text{\color{gray}\footnotesize(minibatch variance)}
\end{align*}
\textbf{Step 2}: Update the moving average: $\mu_\text{EMA},\sigma^2_\text{EMA}$.
\begin{align*}
\mu_\text{EMA} &= (1-\alpha)\mu_\text{EMA} + \alpha \cdot \mu_\mathcal{S} & \text{\color{gray}\footnotesize(avg. mean update)} \\
\sigma^2_\text{EMA} &= (1-\alpha)\sigma^2_\text{EMA}+\alpha\cdot \sigma_\mathcal{S}^2 & \text{\color{gray}\footnotesize(avg. variance update)}
\end{align*}
\textbf{Step 3}: Update the $x_j$.
\begin{align*}
\hat{x}_j &= \frac{x_j-\mu_\mathcal{S}}{\sqrt{\sigma^2_\mathcal{S} + \epsilon}} & \text{\color{gray}\footnotesize(point normalization)} \\
\bar{x}_j &= \gamma\cdot\hat{x}_j + \beta & \text{\color{gray}\footnotesize(scale \& shift)}
\end{align*}
\textbf{Normalization Step} (Test set)\\
\smalltext{Only apply step 3, now using the moving average values.}
\begin{align*}
\hat{x}_j &= \frac{x_j-\mu_\text{EMA}}{\sqrt{\sigma^2_\text{EMA}+\epsilon}} & \text{\color{gray}\footnotesize(point normalization)} \\
\bar{x}_j &= \gamma\cdot\hat{x}_j + \beta & \text{\color{gray}\footnotesize(scale \& shift)}
\end{align*}
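{\footnotesize
\remark A minimal NumPy sketch (not from the script) of the steps above; the values of $\alpha$, $\gamma$, $\beta$ and $\epsilon$ are placeholders.
}
\begin{verbatim}
import numpy as np

def bn_train(x, mu_ema, var_ema, gamma=1.0, beta=0.0,
             alpha=0.1, eps=1e-5):
    """x: minibatch of shape (|S|, d)."""
    mu_S, var_S = x.mean(axis=0), x.var(axis=0)      # Step 1
    mu_ema = (1 - alpha) * mu_ema + alpha * mu_S     # Step 2
    var_ema = (1 - alpha) * var_ema + alpha * var_S
    x_hat = (x - mu_S) / np.sqrt(var_S + eps)        # Step 3
    return gamma * x_hat + beta, mu_ema, var_ema

def bn_test(x, mu_ema, var_ema, gamma=1.0, beta=0.0, eps=1e-5):
    """Only Step 3, using the moving averages."""
    return gamma * (x - mu_ema) / np.sqrt(var_ema + eps) + beta
\end{verbatim}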
\newpage
\subsection{Convolutional Neural Networks}
In fully connected NNs:
$$
h^{(l)} = \psi\Bigl( \textbf{W}^{(l)} h^{(l-1)} \Bigr)
$$
Each unit of layer $l-1$ affects each unit of layer $l$.\\
In CNNs, this is relaxed: not all nodes (must) interact.
\definition \textbf{Convolutional Neural Network} (CNN)\\
Layers are connected via convolutions.
$$
h^{(l)} = \psi\Bigl( w^{(l)} * h^{(l-1)} \Bigr)
$$
{\footnotesize
\remark In CNNs, the weights are also called \textit{filters}.
}
\definition \textbf{Convolution} {\footnotesize (Discrete, 1D) } \\
\subtext{$w \in \R^k,\quad x \in \R^d,\quad i = 1,\ldots,k+d-1$}
$$
(w * x)_i := \sum_{j=\max\{1,i-d+1\}}^{\min\{i,k\}}\Bigl( w_j\cdot x_{i-j+1} \Bigr)
$$
This is easiest to understand by example:
\smalltext{\textbf{Example}: $w = (w_1,w_2)^\top,\quad x=(x_1,x_2,x_3)^\top$}
{\footnotesize
$$
w*x = \begin{bmatrix}
w_1 \cdot x_1 + \color{gray}w_2 \cdot 0 \\
w_1 \cdot x_2 + w_2 \cdot x_1 \\
w_1 \cdot x_3 + w_2 \cdot x_2 \\
\color{gray}w_1 \cdot 0 + \color{black}w_2 \cdot x_3
\end{bmatrix}
$$
}
\smalltext{\textbf{Example}: a CNN with $3$ inputs and $1$ hidden layer:}
{\footnotesize
$$
\begin{bmatrix}
z_1 \\
z_2 \\
z_3 \\
z_4
\end{bmatrix} = \underbrace{\begin{bmatrix}
w_1 & 0 & 0 \\
w_2 & w_1 & 0 \\
0 & w_2 & w_1 \\
0 & 0 & w_2 \\
\end{bmatrix}}_{\textbf{W}^{(1)} \text{ in CNN}} \cdot \begin{bmatrix}
x_1 \\
x_2 \\
x_3
\end{bmatrix} = w * x
$$
}
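{\footnotesize
\remark A small NumPy check (not from the script) of the example above: the full discrete convolution equals multiplication by the sparse matrix $\textbf{W}^{(1)}$; the numeric values are made up.
}
\begin{verbatim}
import numpy as np

w = np.array([1.0, 2.0])        # filter (w_1, w_2)
x = np.array([3.0, 4.0, 5.0])   # input  (x_1, x_2, x_3)

W1 = np.array([[w[0], 0.0,  0.0 ],
               [w[1], w[0], 0.0 ],
               [0.0,  w[1], w[0]],
               [0.0,  0.0,  w[1]]])

assert np.allclose(np.convolve(w, x), W1 @ x)  # both are w * x
\end{verbatim}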
\subsubsection{Multidimensional Convolution}
\textbf{TODO} add explanation
% The script has a very good intutive walkthrough of how this works.
+96
@@ -0,0 +1,96 @@
In \textbf{Unsupervised Learning}, $\mathcal{D}$ contains no labels.\\
Models both define labels \& assign inputs to labels.
$$
\mathcal{D} = \Bigl\{ x_1,\ldots,x_n \Bigr\} \qquad \text{\color{gray}\footnotesize(Dataset in unsupervised learning)}
$$
There are many use-cases:
\begin{enumerate}
\item Compression
\item Discovery of latent variables
\item Anomaly detection
\item Exploratory data analysis
\end{enumerate}
\subsection{Clustering}
\definition \textbf{Clustering}
The goal here is to group inputs into clusters, based on some definition of similarity, e.g. the $l_2$ distance for $\mathcal{D} \subset \R^2$.\\
\subtext{This can be seen as the unsupervised analogue to classification}
\method \textbf{Hierarchical Clustering}
A simple method, using the ``similarity'' measure directly.
\begin{enumerate}
\item Each $x \in \mathcal{D}$ starts in its own cluster
\item Iteratively, the $2$ ``closest'' clusters are merged
\end{enumerate}
This results in a tree, thus \textit{hierarchical} clustering.
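{\footnotesize
\remark A small sketch (not from the script) using SciPy's agglomerative clustering; the data, the linkage criterion (``single'': merge the two closest clusters) and the cut into $2$ clusters are assumptions.
}
\begin{verbatim}
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

X = np.array([[0.0, 0.0], [0.1, 0.0],
              [5.0, 5.0], [5.1, 5.0]])
Z = linkage(X, method="single")   # iteratively merge closest clusters
labels = fcluster(Z, t=2, criterion="maxclust")  # cut tree: 2 clusters
\end{verbatim}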
\method \textbf{Partitioning}
In partitioning methods, a weighted graph is constructed from $\mathcal{D}$ and partitioned using graph-theoretic approaches, e.g. cuts or spectral analysis.
{\footnotesize
\remark Neither hierarchical clustering nor partitioning gives a natural way to deduce cluster membership for new data points.
}
\newpage
\subsubsection{$k$-Means Clustering}
In $k$-means, a cluster is represented by its center: $\mu_j \in \R^d$.
The cluster assignment $z_i$ for $x_i \in \mathcal{D}$:
$$
z_i = \underset{j=1,\ldots,k}{\text{arg min}}\Bigl\Vert x_i-\mu_j \Bigr\Vert \qquad {\color{gray}\footnotesize \text{(Closest center)} }
$$
{\footnotesize
\remark This strategy induces a partition of $\R^d$. (Voronoi Pattern)
}
\textbf{Problem}: How to find $\mu = (\mu_1,\ldots,\mu_k)^\top$?
A new optimization objective:
$$
\hat{R}(\mu) = \sum_{i=1}^n \underset{j\in\{1,\ldots,k\}}{\min}\Bigl\Vert x_i-\mu_j \Bigr\Vert^2 = \sum_{i=1}^n \Bigl\Vert x_i-\mu_{z_i} \Bigr\Vert^2
$$
\subtext{(minimize the sum of sq. distances between points \& their centers)}
So we are searching:
$$
\underset{\mu}{\text{arg min}} \Bigl( \hat{R}(\mu) \Bigr) \qquad {\color{gray}\footnotesize \text{(optimal $k$-means centers)}}
$$
\remark This is non-convex \& NP-hard.
\method \textbf{Lloyd's Heuristic}
This is an iterative method to find the cluster centers.
{\footnotesize
\definition $z^{(t)} = \Bigl( z_1^{(t)},\ldots,z_n^{(t)} \Bigr)^\top$ \color{gray}(assignment of $x_i$ at iter. $t$)\color{black}
\definition $\mu^{(t)} = \Bigl( \mu_1^{(t)},\ldots,\mu_k^{(t)}\Bigr)^\top$ \color{gray}(Cluster centers at iter. $t$)\color{black}
\definition $n_j^{(t)} = \Bigl| \Bigl\{ i=1,\ldots,n\ \Big|\ z_i^{(t)}=j \Bigr\} \Bigr|$ \color{gray}(Size of cluster $j$ at iter. $t$)\color{black}
}
\begin{algorithm}
\caption{Lloyd's Heuristic}
$\mu^{(0)}\gets \Bigl( \mu_1^{(0)},\ldots,\mu_k^{(0)} \Bigr)$\;
\SetKwRepeat{Do}{repeat}{until}
\Do{\text{convergence}}{
$z_i^{(t)} \gets \underset{j \in \{1,\ldots,k\}}{\text{arg min}}\Bigl\Vert x_i-\mu_j^{(t-1)} \Bigr\Vert\quad\ $ for $i=1,\ldots,n$ \;
$\mu_j^{(t)} \gets \frac{1}{n_j^{(t)}}\displaystyle\sum_{i \text{ s.t. } z_{i}^{(t)}=j} x_i\qquad\qquad$ for $j=1,\ldots,k$ \;
$t \gets t+1$ \;
}
\end{algorithm}
{\footnotesize
\remark Each iteration is in $\mathcal{O}\bigl( nkd \bigr)$.
}
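{\footnotesize
\remark A minimal NumPy sketch (not from the script) of Lloyd's heuristic; the initialization with random data points and the convergence test are assumptions.
}
\begin{verbatim}
import numpy as np

def lloyd(X, k, n_iter=100, rng=np.random.default_rng(0)):
    """X: (n, d) data. Returns centers mu (k, d), assignments z (n,)."""
    mu = X[rng.choice(len(X), size=k, replace=False)]  # init
    for _ in range(n_iter):
        d = np.linalg.norm(X[:, None, :] - mu[None, :, :], axis=2)
        z = d.argmin(axis=1)                   # closest center
        new_mu = np.array([X[z == j].mean(axis=0) if np.any(z == j)
                           else mu[j] for j in range(k)])
        if np.allclose(new_mu, mu):            # converged
            break
        mu = new_mu
    return mu, z
\end{verbatim}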
% Continue with convergence analysis, k-means++
\newpage
\subsection{Principal Component Analysis}
Binary file not shown.
