mirror of
https://github.com/janishutz/eth-summaries.git
synced 2026-05-30 16:21:19 +02:00
[AMR] Finish W13 lecture summary
This commit is contained in:
Binary file not shown.
@@ -26,7 +26,7 @@
|
|||||||
\EndProcedure
|
\EndProcedure
|
||||||
\end{algorithmic}
|
\end{algorithmic}
|
||||||
\end{algorithm}
|
\end{algorithm}
|
||||||
Returns a collision-free path as grapha. Need nearest neighbour search.
|
Returns a collision-free path as graph. Need nearest neighbour search.
|
||||||
Extension to RRT* to make path better (because this is quite bad)
|
Extension to RRT* to make path better (because this is quite bad)
|
||||||
\begin{algorithm}
|
\begin{algorithm}
|
||||||
\begin{algorithmic}[1]
|
\begin{algorithmic}[1]
|
||||||
|
|||||||
@@ -1,4 +1,2 @@
|
|||||||
\subsection{Learning To Act}
|
\subsection{Learning To Act}
|
||||||
Used if there is no state-transition model or cost/reward func.
|
Used if there is no state-transition model or cost/reward func.
|
||||||
|
|
||||||
\bi{Markov Decision Process}
|
|
||||||
|
|||||||
@@ -0,0 +1,31 @@
|
|||||||
|
\subsubsection{Markov Decision Process}
|
||||||
|
The goal is to maximize the reward. Along the route, get small reward,
|
||||||
|
at the end large reward (good or bad).
|
||||||
|
|
||||||
|
Def. by states $\vec{x} \in \cX$ (RL: $s$), actions $\vec{u} \in \cU$ (RL: $a$), prob. state trans. $\cT(\vec{x}, \vec{u}, \vec{x}_+) = \P(\vec{x}_+ \divider \vec{x}, \vec{u})$,
|
||||||
|
reward func $\cR(\vec{x}, \vec{u}, \vec{x}_+)$, start state $\vec{x}_0$, optional terminal state $\vec{x}_N$.
|
||||||
|
|
||||||
|
\bi{Utility Func} Expected reward: $V = \sum_{k = 0}^{N} r_k$ or \textit{discounted} reward $V = \sum_{k = 0}^{\8} \gamma^k r_k$ with $\gamma < 1$
|
||||||
|
|
||||||
|
\bi{Solving} (Val iter) $V_0(\vec{x}) = 0$ and $V_{i + 1}(\vec{x}) = \max_{\vec{u}} Q(\vec{x}, \vec{u})$ with
|
||||||
|
\[
|
||||||
|
Q(\vec{x}, \vec{u}) = \sum_{\vec{x}_+} \P(\vec{x}_+ \divider \vec{x}, \vec{u}) [\cR(\vec{x}, \vec{u}, \vec{x}_+) + \gamma V_i(\vec{x}_+)]
|
||||||
|
\]
|
||||||
|
Repeat until conv. to $V^*$ ($\tco{|\cU||\cX|^2}$ per iter). Optimal policy:
|
||||||
|
\[
|
||||||
|
\vec{\pi}^*(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u})
|
||||||
|
\]
|
||||||
|
|
||||||
|
Using policy iter:
|
||||||
|
\begin{algorithm}
|
||||||
|
\begin{algorithmic}[1]
|
||||||
|
\State Choose $\vec{\pi}_0(\vec{x})$
|
||||||
|
\While{\textit{policy} has not converged}
|
||||||
|
\Repeat $V_{i + 1}^{\vec{\pi}_j}(\vec{x}) = Q(\vec{x}, \vec{\pi}_j(\vec{x}))$ $\forall \vec{x}$ and \textit{fixed} pol. $\vec{\pi}_j$
|
||||||
|
\Until{values converge}
|
||||||
|
\EndWhile
|
||||||
|
\State One step: $\vec{\pi}_{j + 1}(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u})$ with $V_i = V_{i + 1}^{\vec{\pi}_j}$
|
||||||
|
\end{algorithmic}
|
||||||
|
\end{algorithm}
|
||||||
|
|
||||||
|
Model-based learning uses empirical models of $\cT$ and $\cR$
|
||||||
|
|||||||
+28
@@ -0,0 +1,28 @@
|
|||||||
|
\subsubsection{Reinforcement Learning}
|
||||||
|
\bi{Passive} \textit{Direct Evaluation} Act according to policy $\vec{\pi}$, store sum of discounted rewards, average them. (But too simple)
|
||||||
|
|
||||||
|
\textit{Sample-Based} Use $V_{i + 1}^{\vec{\pi}}(\vec{x}) = Q(\vec{x}, \vec{\pi}(\vec{x}))$ w/ $V_0^{\vec{\pi}}(\vec{x}) = 0$.
|
||||||
|
This would need the state trans. model; instead use samples $\tilde{R}_j$ (approx. prob. w/ statistics) and thus
|
||||||
|
\[
|
||||||
|
V_{i + 1}^{\vec{\pi}}(\vec{x}) = \frac{1}{N} \sum_{j = 1}^{N} [\tilde{R}_j(\vec{x}, \vec{\pi}(\vec{x}), \vec{x}_+) + \gamma V_i^{\vec{\pi}}(\vec{x}_+)]
|
||||||
|
\]
|
||||||
|
|
||||||
|
\bi{Active} Find optimal policy $\vec{\pi}$ instead of state values $V(\vec{x})$.
|
||||||
|
|
||||||
|
\textit{Q-Learn.} Here: restate Val. Iter in $Q$-Values ($Q_0(\vec{x}, \vec{u}) = 0$):
|
||||||
|
\[
|
||||||
|
Q_{i + 1}(\vec{x}, \vec{u}) = Q(\vec{x}, \vec{u}) \quad \text{with } V_i(\vec{x}_+) = \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u}))
|
||||||
|
\]
|
||||||
|
Compute using sample:
|
||||||
|
\[
|
||||||
|
\tilde{Q}_i(\vec{x}, \vec{u}) = \tilde{R}_i(\vec{x}, \vec{u}, \vec{x}_+) + \gamma \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u}))
|
||||||
|
\]
|
||||||
|
Then update: $Q_{i + 1}(\vec{x}, \vec{u}) = (1 - \alpha)Q_i(\vec{x}, \vec{u}) + \alpha \tilde{Q}_i(\vec{x}, \vec{u})$.
|
||||||
|
Called off-policy learning, needs exploration. Simplest is random actions ($\varepsilon$-greedy):
|
||||||
|
$\varepsilon$ is prob. to act randomly, $1 - \varepsilon$ is prob. to act on pol. {\color{red} Drawback: even once the space is explored, still doing random stuff}
|
||||||
|
|
||||||
|
\bi{Approaches}
|
||||||
|
\textit{Model-based} (estimate trans. model, e.g. Dyna),
|
||||||
|
\textit{Value-based} (estimate val or $Q$-func and extract pol., e.g. Q-Learn),
|
||||||
|
\textit{Actor-Critic} (estim. val or $Q$ of curr. pol., improve pol., e.g. A3C, SAC),
|
||||||
|
\textit{Policy-Gradient} (diff. expect. reward w.r.t. params of policy network, e.g. REINFORCE)
|
||||||
|
|||||||
Reference in New Issue
Block a user