mirror of
https://github.com/janishutz/eth-summaries.git
synced 2026-04-28 16:19:23 +02:00
[IML] kernels, 1
This commit is contained in:
Binary file not shown.
@@ -18,4 +18,8 @@
|
|||||||
\section{Classification}
|
\section{Classification}
|
||||||
\input{parts/02_classification.tex}
|
\input{parts/02_classification.tex}
|
||||||
|
|
||||||
|
\newpage
|
||||||
|
\section{Kernels}
|
||||||
|
\input{parts/03_kernels.tex}
|
||||||
|
|
||||||
\end{document}
|
\end{document}
|
||||||
|
|||||||
@@ -305,3 +305,47 @@ $$
|
|||||||
$$
|
$$
|
||||||
}
|
}
|
||||||
\subtext{Here $c_\text{FP}, c_\text{FN}$ are the weights for penalization}
|
\subtext{Here $c_\text{FP}, c_\text{FN}$ are the weights for penalization}
|
||||||
|
|
||||||
|
Generally, reducing FP errors increases FN errors, and vice versa.
|
||||||
|
|
||||||
|
% The script had a few more ratios defined here, but they all seem relatively basic so I didn't include them here
|
||||||
|
|
||||||
|
\newpage
|
||||||
|
\subsection{ROC Curves}
|
||||||
|
|
||||||
|
\remark A side-effect of using $\hat{y}(x) = \text{sign}\bigl( \hat{f}(x) \bigr)$ is that the magnitude $|\hat{f}(x)|$ can be interpreted as \textit{confidence}.\\
|
||||||
|
We can set:
|
||||||
|
$$
|
||||||
|
\hat{y}_\tau(x) = \text{sign}\Bigl( \hat{f}(x) - \tau \Bigr) = \begin{cases}
|
||||||
|
+1 & \text{if } \hat{f}(x) > \tau \\
|
||||||
|
-1 & \text{if } \hat{f}(x) \le \tau
|
||||||
|
\end{cases}
|
||||||
|
$$
|
||||||
|
Now $\tau$ can be used to penalize FP ($\tau > 0$) or FN ($\tau < 0$).\\
|
||||||
|
\subtext{Note how this way, we don't modify the Optimization problem.}
|
||||||
|
|
||||||
|
What if we don't know which FP/TP rate is desired?\\
|
||||||
|
\subtext{Formally: which $\tau$ should be used?}
|
||||||
|
|
||||||
|
\definition \textbf{ROC Curve} (Receiver Operating Characteristic)\\
|
||||||
|
Plots TP Rate against FP Rate for different $\tau$.
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
ROC Curve on 4 classifiers\\
|
||||||
|
\includegraphics[width=0.75\linewidth]{resources/ROC.png}\\
|
||||||
|
{\scriptsize\color{gray}
|
||||||
|
\textit{Introduction to Machine Learning (2026), p. 160}
|
||||||
|
}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
{\scriptsize
|
||||||
|
\remark \textbf{How to read this?} A straight line is equivalent to random guessing, anything above is better.
|
||||||
|
$\tau$ isn't directly included in the curve, but it follows from the definition that $\tau$ decreases as the FP rate increases.
|
||||||
|
}
|
||||||
|
|
||||||
|
How can we measure performance independent of $\tau$?
|
||||||
|
|
||||||
|
\definition \textbf{AUROC} (Area under ROC)\\
|
||||||
|
AUROC is $1$ for the ideal classifier, and always in $[0,1]$.
|
||||||
|
|
||||||
|
% Script further discusses optimizing for minority groups and the notion of fairness in models. Wasn't discussed in class. Might add in summer on 2nd read.
|
||||||
@@ -0,0 +1,95 @@
|
|||||||
|
\textbf{Motivation:} Regression using feature maps $\phi: \R^d \to \R^p$:
|
||||||
|
$$
|
||||||
|
\underset{w \in \R^p}{\min}\frac{1}{n}\sum_{i=1}^{n}l\Bigl( y_i, w^\top \cdot \phi(x_i) \Bigr)
|
||||||
|
$$
|
||||||
|
What if computing/storing $\phi(x)$ is expensive/infeasible?\\
|
||||||
|
\subtext{e.g. if $p$ is large, or infinite}
|
||||||
|
|
||||||
|
{\scriptsize
|
||||||
|
\remark To store a poly. $p(x): \R^d \to \R$ with $\deg(p)=m$ we require $p=\mathcal{O}(d^m)$ features. Storing $n$ data points requires $\mathcal{O}(nd^m)$ memory. Not good.
|
||||||
|
}
|
||||||
|
|
||||||
|
\subsection{Kernelization}
|
||||||
|
|
||||||
|
By constraining $w$ to $\text{span}(\Phi^\top) \subset \R^p$ we can drastically improve memory usage. Since we know a minimizer exists here, we don't ``lose anything''.
|
||||||
|
|
||||||
|
\definition \textbf{Kernelization}
|
||||||
|
|
||||||
|
\begin{enumerate}
|
||||||
|
\item \textbf{Reparametrization}: We assume $w = \Phi^\top\alpha$ (i)
|
||||||
|
\item \textbf{Loss via Inner Products}: Observe:
|
||||||
|
$$
|
||||||
|
f(x) = w^\top \phi(x) \overset{\text{(i)}}{=} (\Phi^\top \alpha)^\top \phi(x) = \sum_{i=1}^{n} \alpha_i \Bigl( \phi(x_i)^\top \phi(x) \Bigr)
|
||||||
|
$$
|
||||||
|
Note: $x_i$ only appears in \textit{inner products} $\phi(x_i)^\top \phi(x_j)$
|
||||||
|
\item \textbf{Replace Inner Products}: We define:
|
||||||
|
$$
|
||||||
|
k:\begin{cases}
|
||||||
|
\R^d\times\R^d\to\R \\
|
||||||
|
k(x,x') = \phi(x)^\top \phi(x')
|
||||||
|
\end{cases}
|
||||||
|
\quad
|
||||||
|
K:\begin{cases}
|
||||||
|
K \in \R^{n\times n} \\
|
||||||
|
K_{ij} = k(x_i,x_j)
|
||||||
|
\end{cases}
|
||||||
|
$$
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
Now, we can reformulate the optimization problem:
|
||||||
|
$$
|
||||||
|
\underset{\alpha\in\R^n}{\min}\frac{1}{n}\sum_{i=1}^{n}l\Biggl( y_i, \sum_{j=1}^{n}\alpha_j k(x_i,x_j) \Biggr) = \underset{\alpha\in\R^n}{\min}\frac{1}{n}\sum_{i=1}^{n}l\Bigl( y_i, (K\alpha)_i \Bigr)
|
||||||
|
$$
|
||||||
|
|
||||||
|
By storing $K \in \R^{n\times n}$ instead of $\phi(x_i) \in \R^p$ for $i=1,\ldots,n$, the memory usage is reduced: $\mathcal{O}(np) \to \mathcal{O}(n^2)$.
|
||||||
|
|
||||||
|
\newpage
|
||||||
|
\subsection{The Kernel Trick}
|
||||||
|
|
||||||
|
Using $k$, the computation time is still $\mathcal{O}(n^2p)$ if
|
||||||
|
$$
|
||||||
|
k(x_i,x_j) = \phi(x_i)^\top \phi(x_j)
|
||||||
|
$$
|
||||||
|
So let's replace $k$ with a simple function, which guarantees the existence of some $\phi$ (which we never calculate).
|
||||||
|
|
||||||
|
{\scriptsize
|
||||||
|
\remark Since we only \textit{implicitly} specify $\phi$ via $k$, we can use $\phi$ s.t. $p=\infty$ now.
|
||||||
|
}
|
||||||
|
|
||||||
|
\definition \textbf{Kernel Function} $k: \R^d \times \R^d \to \R$
|
||||||
|
\begin{enumerate}
|
||||||
|
\item $k$ is symmetric: $\forall x,x':\ k(x,x') = k(x',x)$
|
||||||
|
\item $k$ is PSD: $\forall n \in \N, \forall x_1,\ldots,x_n \in \R^d$:
|
||||||
|
$$
|
||||||
|
K = \begin{bmatrix}
|
||||||
|
k(x_1,x_1) & \cdots & k(x_1,x_n) \\
|
||||||
|
\vdots & \ddots & \vdots \\
|
||||||
|
k(x_n,x_1) & \cdots & k(x_n,x_n)
|
||||||
|
\end{bmatrix} \text{ is PSD}
|
||||||
|
$$
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\theorem \textbf{Kernels guarantee existence of} $\phi$\\
|
||||||
|
\smalltext{If $k$ is a kernel, there exists a Hilbert Space $\Bigl(\mathcal{H},\langle\cdot,\cdot\rangle_\mathcal{H}\Bigr)$ s.t.}
|
||||||
|
$$
|
||||||
|
\exists\phi:\R^d\to\mathcal{H} \text{ s.t. } k(x,x') = \Bigl\langle \phi(x),\phi(x') \Bigr\rangle_\mathcal{H} \quad \forall x,x' \in \R^d
|
||||||
|
$$
|
||||||
|
\subtext{$\mathcal{H}$ may be, for example, $\R^p$ with the standard inner product $\langle\cdot,\cdot\rangle$ (which induces $\Vert\cdot\Vert_2$).}
|
||||||
|
|
||||||
|
\lemma \textbf{Properties of Kernels}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Composed feature maps are Kernels
|
||||||
|
$$
|
||||||
|
\begin{rcases*}
|
||||||
|
\phi: \R^d \to \R^p \\
|
||||||
|
\psi: \R^p \to \R^{p'}
|
||||||
|
\end{rcases*}
|
||||||
|
\quad k(x,x') = \Bigl\langle \psi\bigl( \phi(x) \bigr), \psi\bigl( \phi(x') \bigr) \Bigr\rangle
|
||||||
|
$$
|
||||||
|
\item Kernels can be added in 2 ways, yielding a kernel
|
||||||
|
\begin{align*}
|
||||||
|
\text{(i)}\quad & k\Bigl( (x,y),(x',y') \Bigr) = k_1(x,x') + k_2(y,y') \\
|
||||||
|
\text{(ii)}\quad & k(x,x') = k_1(x,x') + k_2(x,x')
|
||||||
|
\end{align*}
|
||||||
|
\item Kernels can be multiplied in 2 ways, yielding a kernel
|
||||||
|
\end{enumerate}
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 27 KiB |
Reference in New Issue
Block a user