diff --git a/semester6/iml/main.pdf b/semester6/iml/main.pdf
index 527949c..09c16a6 100644
Binary files a/semester6/iml/main.pdf and b/semester6/iml/main.pdf differ
diff --git a/semester6/iml/main.tex b/semester6/iml/main.tex
index 7517fa6..f3801ef 100644
--- a/semester6/iml/main.tex
+++ b/semester6/iml/main.tex
@@ -18,4 +18,8 @@
 \section{Classification}
 \input{parts/02_classification.tex}
 
+\newpage
+\section{Kernels}
+\input{parts/03_kernels.tex}
+
 \end{document}
diff --git a/semester6/iml/parts/02_classification.tex b/semester6/iml/parts/02_classification.tex
index 8c8dfbd..fc49ae3 100644
--- a/semester6/iml/parts/02_classification.tex
+++ b/semester6/iml/parts/02_classification.tex
@@ -304,4 +304,48 @@
 $$
     \frac{c_\text{FN}}{|\{ x \sep y=+1 \}|} \underbrace{\sum_{(x,y),\, y=+1} \I_{\hat{y}(x) = -1}}_\text{\#FN} + \frac{c_\text{FP}}{|\{x \sep y =-1 \}|}\underbrace{\sum_{(x,y),\, y=-1} \I_{\hat{y}(x)=+1}}_\text{\#FP}
 $$
 }
-\subtext{Here $c_\text{FP}, c_\text{FN}$ are the weights for penalization}
\ No newline at end of file
+\subtext{Here $c_\text{FP}, c_\text{FN}$ are the weights for penalization}
+
+Generally, reducing FP errors increases FN errors, and vice versa.
+
+% The script had a few more ratios defined here, but they all seem relatively basic so I didn't include them here
+
+\newpage
+\subsection{ROC Curves}
+
+\remark A side-effect of using $\hat{y}(x) = \text{sign}\bigl(\hat{f}(x)\bigr)$ is that the magnitude $|\hat{f}(x)|$ can be interpreted as \textit{confidence}.\\
+We can set:
+$$
+    \hat{y}_\tau(x) = \text{sign}\Bigl( \hat{f}(x) - \tau \Bigr) = \begin{cases}
+        +1 & \text{if } \hat{f}(x) \geq \tau \\
+        -1 & \text{if } \hat{f}(x) < \tau
+    \end{cases}
+$$
+Now $\tau$ can be used to penalize FP ($\tau > 0$) or FN ($\tau < 0$).\\
+\subtext{Note how this way, we don't modify the optimization problem.}
+
+What if we don't know which FP/TP trade-off is desired?\\
+\subtext{Formally: which $\tau$ should be used?}
+
+\definition \textbf{ROC Curve} (Receiver Operating Characteristic)\\
+Plots the TP rate against the FP rate as $\tau$ varies.
+
+\begin{center}
+    ROC curves for 4 classifiers\\
+    \includegraphics[width=0.75\linewidth]{resources/ROC.png}\\
+    {\scriptsize\color{gray}
+        \textit{Introduction to Machine Learning (2026), p. 160}
+    }
+\end{center}
+
+{\scriptsize
+    \remark \textbf{How to read this?} The diagonal corresponds to random guessing; anything above it is better.
+    $\tau$ isn't directly included in the curve, but it follows from the definition that $\tau$ decreases as the FP rate increases.
+}
+
+How can we measure performance independent of $\tau$?
+
+\definition \textbf{AUROC} (Area under ROC)\\
+AUROC is $1$ for the ideal classifier, $0.5$ for random guessing, and always in $[0,1]$.
+
+% Script further discusses optimizing for minority groups and the notion of fairness in models. Wasn't discussed in class. Might add in summer on 2nd read.
\ No newline at end of file
diff --git a/semester6/iml/parts/03_kernels.tex b/semester6/iml/parts/03_kernels.tex
new file mode 100644
index 0000000..e4325f9
--- /dev/null
+++ b/semester6/iml/parts/03_kernels.tex
@@ -0,0 +1,95 @@
+\textbf{Motivation:} Regression using feature maps $\phi: \R^d \to \R^p$:
+$$
+    \underset{w \in \R^p}{\min}\frac{1}{n}\sum_{i=1}^{n}l\Bigl( y_i, w^\top \phi(x_i) \Bigr)
+$$
+What if computing/storing $\phi(x)$ is expensive/infeasible?\\
+\subtext{e.g. if $p$ is large, or infinite}
+
+{\scriptsize
+    \remark To store a polynomial $q: \R^d \to \R$ with $\deg(q)=m$ we require $p=\mathcal{O}(d^m)$ features. Storing $n$ data points requires $\mathcal{O}(nd^m)$ memory. Not good.
+}
+
+\subsection{Kernelization}
+
+By constraining $w$ to $\text{span}(\Phi^\top) = \text{span}\{\phi(x_1),\ldots,\phi(x_n)\} \subset \R^p$ we can drastically improve memory usage. Any component of $w$ orthogonal to this span doesn't change the predictions $w^\top\phi(x_i)$, so a minimizer exists in the span and we don't ``lose anything''.
+
+\definition \textbf{Kernelization}
+
+\begin{enumerate}
+    \item \textbf{Reparametrization}: We assume $w = \Phi^\top\alpha$ for some $\alpha \in \R^n$ (i)
+    \item \textbf{Loss via Inner Products}: Observe:
+    $$
+        f(x) = w^\top \phi(x) \overset{\text{(i)}}{=} (\Phi^\top \alpha)^\top \phi(x) = \sum_{i=1}^{n} \alpha_i \Bigl( \phi(x_i)^\top \phi(x) \Bigr)
+    $$
+    Note: the data $x_i$ only appear in \textit{inner products} $\phi(x_i)^\top \phi(x_j)$
+    \item \textbf{Replace Inner Products}: We define:
+    $$
+        k:\begin{cases}
+            \R^d\times\R^d\to\R \\
+            k(x,x') = \phi(x)^\top \phi(x')
+        \end{cases}
+        \quad
+        K:\begin{cases}
+            K \in \R^{n\times n} \\
+            K_{ij} = k(x_i,x_j)
+        \end{cases}
+    $$
+\end{enumerate}
+
+Now, we can reformulate the optimization problem:
+$$
+    \underset{\alpha\in\R^n}{\min}\frac{1}{n}\sum_{i=1}^{n}l\Biggl( y_i, \sum_{j=1}^{n}\alpha_j k(x_i,x_j) \Biggr) = \underset{\alpha\in\R^n}{\min}\frac{1}{n}\sum_{i=1}^{n}l\Bigl( y_i, (K\alpha)_i \Bigr)
+$$
+
+By storing $K \in \R^{n\times n}$ instead of $\phi(x_i) \in \R^p$ for $i=1,\ldots,n$, the memory usage is reduced: $\mathcal{O}(np) \to \mathcal{O}(n^2)$.
+
+\newpage
+\subsection{The Kernel Trick}
+
+Using $k$, computing $K$ still takes $\mathcal{O}(n^2p)$ time if we evaluate
+$$
+    k(x_i,x_j) = \phi(x_i)^\top \phi(x_j)
+$$
+explicitly. So let's replace $k$ with a simple function which guarantees the existence of some $\phi$ (which we never compute).
+
+{\scriptsize
+    \remark Since we only \textit{implicitly} specify $\phi$ via $k$, we can use $\phi$ s.t. $p=\infty$ now.
+}
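+
+% Worked example (my addition, not from the script): a standard sanity check that such a "simple function" exists.
+{\scriptsize
+    \remark \textbf{Example} For $d=2$, take $\phi(x) = \bigl( x_1^2,\ \sqrt{2}\,x_1 x_2,\ x_2^2 \bigr) \in \R^3$. Then
+    $$
+        \phi(x)^\top \phi(x') = x_1^2 x_1'^2 + 2\, x_1 x_2\, x_1' x_2' + x_2^2 x_2'^2 = (x^\top x')^2
+    $$
+    so the simple function $k(x,x') = (x^\top x')^2$ evaluates the inner product of all $p=3$ quadratic features in $\mathcal{O}(d)$ time, without ever forming $\phi(x)$.
+}
+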
+\definition \textbf{Kernel Function} $k: \R^d \times \R^d \to \R$
+\begin{enumerate}
+    \item $k$ is symmetric: $\forall x,x':\ k(x,x') = k(x',x)$
+    \item $k$ is PSD: $\forall n \in \N,\ \forall x_1,\ldots,x_n \in \R^d$:
+    $$
+        K = \begin{bmatrix}
+            k(x_1,x_1) & \cdots & k(x_1,x_n) \\
+            \vdots & \ddots & \vdots \\
+            k(x_n,x_1) & \cdots & k(x_n,x_n)
+        \end{bmatrix} \text{ is PSD}
+    $$
+\end{enumerate}
+
+\theorem \textbf{Kernels guarantee existence of} $\phi$\\
+\smalltext{If $k$ is a kernel, there exists a Hilbert Space $\Bigl(\mathcal{H},\langle\cdot,\cdot\rangle_\mathcal{H}\Bigr)$ s.t.}
+$$
+    \exists\phi:\R^d\to\mathcal{H} \text{ s.t. } k(x,x') = \Bigl\langle \phi(x),\phi(x') \Bigr\rangle_\mathcal{H} \quad \forall x,x' \in \R^d
+$$
+\subtext{$\mathcal{H}$ may be, for example, $\R^p$ with the standard inner product.}
+
+\lemma \textbf{Properties of Kernels}
+\begin{enumerate}
+    \item Composed feature maps are Kernels
+    $$
+        \begin{rcases*}
+            \phi: \R^d \to \R^p \\
+            \psi: \R^p \to \R^{p'}
+        \end{rcases*}
+        \quad k(x,x') = \Bigl\langle \psi\bigl( \phi(x) \bigr), \psi\bigl( \phi(x') \bigr) \Bigr\rangle
+    $$
+    \item Kernels can be added in 2 ways, yielding a kernel
+    \begin{align*}
+        \text{(i)}\quad & k\Bigl( (x,y),(x',y') \Bigr) = k_1(x,x') + k_2(y,y') \\
+        \text{(ii)}\quad & k(x,x') = k_1(x,x') + k_2(x,x')
+    \end{align*}
+    \item Kernels can be multiplied in 2 ways, yielding a kernel
+    \begin{align*}
+        \text{(i)}\quad & k\Bigl( (x,y),(x',y') \Bigr) = k_1(x,x') \cdot k_2(y,y') \\
+        \text{(ii)}\quad & k(x,x') = k_1(x,x') \cdot k_2(x,x')
+    \end{align*}
+\end{enumerate}
\ No newline at end of file
diff --git a/semester6/iml/resources/ROC.png b/semester6/iml/resources/ROC.png
new file mode 100644
index 0000000..7db7d74
Binary files /dev/null and b/semester6/iml/resources/ROC.png differ
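A possible follow-up block for parts/03_kernels.tex (my sketch, not from the script): the standard $p=\infty$ example promised by the remark in the Kernel Trick subsection, plus the one-line PSD check behind addition rule (ii) of the lemma. The bandwidth symbol $h$ is my choice of notation.

+{\scriptsize
+    \remark \textbf{Example} The Gaussian kernel $k(x,x') = \exp\bigl( -\Vert x - x' \Vert_2^2 / (2h^2) \bigr)$ with bandwidth $h > 0$ is a kernel whose feature map is infinite-dimensional ($p = \infty$), so it is only usable via the kernel trick.
+    That sums of kernels are again kernels follows directly from the PSD condition: for any $\alpha \in \R^n$,
+    $$
+        \alpha^\top (K_1 + K_2)\, \alpha = \alpha^\top K_1 \alpha + \alpha^\top K_2 \alpha \geq 0.
+    $$
+}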