mirror of
https://github.com/janishutz/eth-summaries.git
synced 2026-04-28 16:19:23 +02:00
[PS] fixes
This commit is contained in:
Binary file not shown.
@@ -29,6 +29,12 @@ $$
|
|||||||
\forall a \in \R:\qquad F_X(a) = \P[X \leq a]
|
\forall a \in \R:\qquad F_X(a) = \P[X \leq a]
|
||||||
$$
|
$$
|
||||||
|
|
||||||
|
{\scriptsize
|
||||||
|
\notation $\underset{x \to a^-}{\lim}F_X(x) = F(a^-) = \P[X < a]$
|
||||||
|
|
||||||
|
\lemma $\P[a < X \leq b] = F_X(b) - F_X(a)$
|
||||||
|
}
|
||||||
|
|
||||||
\theorem \textbf{Eigenschaften der Verteilungsfunktion}
|
\theorem \textbf{Eigenschaften der Verteilungsfunktion}
|
||||||
|
|
||||||
\begin{tabular}{ll}
|
\begin{tabular}{ll}
|
||||||
@@ -43,6 +49,8 @@ $$
|
|||||||
\P[X_1 \leq x_1,\cdots, X_n \leq x_n] = \P[X_1 \leq x_1] \cdots \P[X_n \leq x_n]
|
\P[X_1 \leq x_1,\cdots, X_n \leq x_n] = \P[X_1 \leq x_1] \cdots \P[X_n \leq x_n]
|
||||||
$$
|
$$
|
||||||
|
|
||||||
|
\newpage
|
||||||
|
|
||||||
\theorem \textbf{Unabhängigkeit von Gruppierungen}\\
|
\theorem \textbf{Unabhängigkeit von Gruppierungen}\\
|
||||||
\smalltext{$X_1,\cdots,X_n$ sind unabhängig, dann sind auch $Y_1,\cdots,Y_k$ unabhängig:}
|
\smalltext{$X_1,\cdots,X_n$ sind unabhängig, dann sind auch $Y_1,\cdots,Y_k$ unabhängig:}
|
||||||
$$
|
$$
|
||||||
@@ -50,8 +58,6 @@ $$
|
|||||||
$$
|
$$
|
||||||
\subtext{$1 \leq i_1 < i_2 < \cdots < i_k \leq n \text{ sind Indizes},\quad \phi_1,\cdots,\phi_k \text{ sind Abbildungen}$}
|
\subtext{$1 \leq i_1 < i_2 < \cdots < i_k \leq n \text{ sind Indizes},\quad \phi_1,\cdots,\phi_k \text{ sind Abbildungen}$}
|
||||||
|
|
||||||
\newpage
|
|
||||||
|
|
||||||
\definition \textbf{Folgen von Zufallsvariablen}
|
\definition \textbf{Folgen von Zufallsvariablen}
|
||||||
|
|
||||||
\begin{tabular}{llll}
|
\begin{tabular}{llll}
|
||||||
@@ -224,6 +230,3 @@ $$
|
|||||||
$$
|
$$
|
||||||
|
|
||||||
\definition \textbf{Exponentialverteilung}
|
\definition \textbf{Exponentialverteilung}
|
||||||
$$
|
|
||||||
|
|
||||||
$$
|
|
||||||
Binary file not shown.
@@ -9,7 +9,8 @@
|
|||||||
|
|
||||||
\begin{document}
|
\begin{document}
|
||||||
|
|
||||||
\section{Intro}
|
|
||||||
\input{parts/00_intro.tex}
|
\input{parts/00_intro.tex}
|
||||||
|
\section{Regression}
|
||||||
|
\input{parts/01_regression.tex}
|
||||||
|
|
||||||
\end{document}
|
\end{document}
|
||||||
|
|||||||
@@ -1,4 +1 @@
|
|||||||
\textbf{placeholder}
|
\textbf{placeholder}
|
||||||
$$
|
|
||||||
a = x\cdot 16
|
|
||||||
$$
|
|
||||||
@@ -0,0 +1,103 @@
|
|||||||
|
\subsection{Supervised Learning}
|
||||||
|
|
||||||
|
\textbf{Supervised Learning} is the task of 'learning' a function relationship, based on a given set of inputs/outputs.
|
||||||
|
|
||||||
|
Some terminology:
|
||||||
|
|
||||||
|
\begin{tabular}{ll}
|
||||||
|
$x \in \R^d$ & Inputs (Attributes/Covariates) \\
|
||||||
|
$\phi(x) \in \R^p$ & Features \\
|
||||||
|
$y \in \R$ & Outputs (Targets/Labels) \\
|
||||||
|
$D = \{ (x_i,y_i) \}_{i=1}^n$ & Training Set \\
|
||||||
|
$D'$ & Test Set \\
|
||||||
|
$f: \R^p \to \R$ & Predictor (Model) \\
|
||||||
|
$l(f(x), y)$ & Loss
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
\textbf{Machine Learning Pipelines} can often be classified using:
|
||||||
|
|
||||||
|
\begin{tabular}{ll}
|
||||||
|
$F$ & Function Class \\
|
||||||
|
$L(f)$ & Training Loss \\
|
||||||
|
& Optimization Method
|
||||||
|
\end{tabular}
|
||||||
|
|
||||||
|
{\small
|
||||||
|
The function class $F$ is a set of parametrized functions.
|
||||||
|
We are looking for the $f \in F$ that minimizes $L(f)$.
|
||||||
|
}
|
||||||
|
|
||||||
|
\definition \textbf{Training Loss}
|
||||||
|
$$
|
||||||
|
L(f) := \frac{1}{n}\sum_{i=1}^{n} l\bigl( f(x_i), y_i \bigr)
|
||||||
|
$$
|
||||||
|
|
||||||
|
\subsection{Multiple Linear Regression}
|
||||||
|
|
||||||
|
\textbf{Multiple Linear Regression} directly uses the $x \in \R^d$. \\
|
||||||
|
Here, $F_\text{affine} = \bigl\{ f(x) = w^\top x + w_0 \big| w \in \R^d, w_0 \in \R \bigr\}$.
|
||||||
|
|
||||||
|
Why are we using linear functions instead?\\
|
||||||
|
{\scriptsize
|
||||||
|
Any estimator $f \in F_\text{affine}$ can be rewritten as $f\bigl((x,1)\bigr) = (w,w_0)^\top\cdot(x,1)$,
|
||||||
|
thus we can augment the inputs $x \mapsto (x,1)$ and \\
|
||||||
|
instead search in $F_\text{linear} = \{ f(x) = \hat{w}^\top x | \hat{w} \in \R^{d+1} \}$
|
||||||
|
}
|
||||||
|
|
||||||
|
\newpage
|
||||||
|
|
||||||
|
\textbf{Loss Functions}
|
||||||
|
|
||||||
|
\definition \textbf{Squared Loss} $\quad l\bigl( f(x),y \bigr) := \bigl( f(x) - y \bigr)^2$\\
|
||||||
|
\subtext{Most common Loss Function, but sensitive to outliers.}
|
||||||
|
|
||||||
|
\definition \textbf{Absolute Loss} $\quad l_\text{abs}\bigl( f(x),y \bigr) := |f(x)-y|$\\
|
||||||
|
\subtext{Less sensitive to outliers, but not differentiable.}
|
||||||
|
|
||||||
|
\definition \textbf{Huber Loss}
|
||||||
|
$$
|
||||||
|
l_\text{huber}\bigl( f(x),y \bigr) := \begin{cases}
|
||||||
|
\frac{1}{2}\bigl( f(x)-y \bigr)^2 & |f(x)-y| \leq \delta \\
|
||||||
|
\delta \bigl( |f(x)-y| - \frac{1}{2}\delta \bigr) & |f(x)-y| > \delta
|
||||||
|
\end{cases}
|
||||||
|
$$
|
||||||
|
\subtext{Using parameter $\delta$, the penalization of outliers can be controlled}
|
||||||
|
|
||||||
|
\textbf{Asymmetric Loss}: In some cases it is desirable to penalize overestimation harder than underestimation, or vice versa.
|
||||||
|
|
||||||
|
\definition \textbf{Quantile Loss}
|
||||||
|
$$
|
||||||
|
l_\tau\bigl( f(x),y \bigr) := \tau \max\Bigl\{ y-f(x),0 \Bigr\} + (1-\tau)\max\Bigl\{ f(x)-y, 0 \Bigr\}
|
||||||
|
$$
|
||||||
|
\subtext{Using parameter $\tau$, over/underestimation can be penalized}
|
||||||
|
|
||||||
|
\newpage
|
||||||
|
|
||||||
|
\textbf{Linear Regression}
|
||||||
|
|
||||||
|
To find $\hat{f} := \underset{f \in F_\text{linear}}{\text{arg min}} L(f)$ we just look for $w \in \R^d$.
|
||||||
|
$$
|
||||||
|
\hat{w} := \underset{w \in \R^d}{\text{arg min}} L(f_w) = \frac{1}{n}\sum_{i=1}^{n}\underbrace{\Bigl( y_i - w^\top x_i \Bigr)^2}_{l\bigl(f(x_i), y_i\bigr)}
|
||||||
|
$$
|
||||||
|
\subtext{A natural abuse of notation here is $L(w) := L(f_w)$.}
|
||||||
|
|
||||||
|
This can be rewritten in matrix notation:
|
||||||
|
$$
|
||||||
|
\sum_{i=1}^{n}\Bigl( y_i - w^\top x_i \Bigr)^2 = \bigl\Vert y-Xw \bigr\Vert^2
|
||||||
|
$$
|
||||||
|
\subtext{The factor $\frac{1}{n}$ is irrelevant for Optimization, it doesn't depend on $w$}
|
||||||
|
|
||||||
|
So we find the usual problem:
|
||||||
|
$$
|
||||||
|
\hat{w} = \underset{w \in \R^d}{\text{arg min}} \bigl\Vert y-Xw \bigr\Vert^2
|
||||||
|
$$
|
||||||
|
The solution is a stationary point, so:
|
||||||
|
$$
|
||||||
|
\nabla_w \bigl\Vert y-Xw \bigr\Vert^2 = 2X^\top(X\hat{w}-y) = 0
|
||||||
|
$$
|
||||||
|
Which yields the known \textbf{Normal Equation}
|
||||||
|
$$
|
||||||
|
X^\top X\hat{w} = X^\top y
|
||||||
|
$$
|
||||||
|
|
||||||
|
|
||||||
@@ -61,11 +61,11 @@
|
|||||||
\def \F{\mathcal{F}}
|
\def \F{\mathcal{F}}
|
||||||
|
|
||||||
% Titles
|
% Titles
|
||||||
\def \definition{\colorbox{lightgray}{Def} }
|
\def \definition{\colorbox{lightgray}{D.} }
|
||||||
\def \notation{\colorbox{lightgray}{Notation} }
|
\def \notation{\colorbox{lightgray}{Notation} }
|
||||||
\def \remark{\colorbox{lightgray}{Remark} }
|
\def \remark{\colorbox{lightgray}{Rmk.} }
|
||||||
\def \theorem{\colorbox{lightgray}{Th.} }
|
\def \theorem{\colorbox{lightgray}{Th.} }
|
||||||
\def \lemma{\colorbox{lightgray}{Lem.} }
|
\def \lemma{\colorbox{lightgray}{L.} }
|
||||||
\def \method{\colorbox{lightgray}{Method} }
|
\def \method{\colorbox{lightgray}{Method} }
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user