[AMR] Finish W13 lecture summary

2026-07-27 21:29:09 +02:00 · 2026-05-25 08:39:50 +02:00
parent df0dfcb1ef
commit c06e4d6c83
5 changed files with 60 additions and 3 deletions
@@ -26,7 +26,7 @@
        \EndProcedure
    \end{algorithmic}
 \end{algorithm}
-Returns a collision-free path as grapha. Need nearest neighbour search.
+Returns a collision-free path as graph. Need nearest neighbour search.
 Extension to RRT* to make path better (because this is quite bad)
 \begin{algorithm}
    \begin{algorithmic}[1]
@@ -1,4 +1,2 @@
 \subsection{Learning To Act}
 Used if there is no state-transition model or cost/reward func.
 \bi{Markov Decision Process}
@@ -0,0 +1,31 @@
 \subsubsection{Markov Decision Process}
 The goal is to maximize the reward. Along the route, get small reward,
 at the end large reward (good or bad).
 Def. by states $\vec{x} \in \cX$ (RL: $s$), actions $\vec{u} \in \cU$ (RL: $a$), prob. state trans. $\cT(\vec{x}, \vec{u}, \vec{x}_+) = \P(\vec{x}_+ \divider \vec{x}, \vec{u})$,
 reward func $\cR(\vec{x}, \vec{u}, \vec{x}_+)$, start state $\vec{x}_0$, optional terminal state $\vec{x}_N$.
 \bi{Utility Func} Expected reward: $V = \sum_{k = 0}^{N} r_k$ or \textit{discounted} reward  $V = \sum_{k = 0}^{\8} \gamma^k r_k$ with $\gamma < 1$
 \bi{Solving} (Val iter) $V_0(\vec{x}) = 0$ and $V_{i + 1}(\vec{x}) = \max_{\vec{u}} Q(\vec{x}, \vec{u})$ with
 \[
    Q(\vec{x}, \vec{u}) = \sum_{\vec{x}_+} \P(\vec{x}_+ \divider \vec{x}, \vec{u}) [\cR(\vec{x}, \vec{u}, \vec{x}_+) + \gamma V_i(\vec{x}_+)]
 \]
 Repeat until conv. to $V^*$ ($\tco{|\cU||\cX|^2}$ per iter). Optimal policy:
 \[
    \vec{\pi}^*(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u})
 \]
 Using policy iter:
 \begin{algorithm}
    \begin{algorithmic}[1]
        \State Choose $\vec{\pi}_0(\vec{x})$
        \While{\textit{policy} has not converged}
        \Repeat $V_{i + 1}^{\vec{\pi}_j}(\vec{x}) = Q(\vec{x}, \vec{\pi}_j(\vec{x}))$ $\forall \vec{x}$ and \textit{fixed} pol. $\vec{\pi}_j$
            \Until{values converge}
            \State One step: $\vec{\pi}_{j + 1}(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u})$ with $V_i = V_{i + 1}^{\vec{\pi}_j}$
        \EndWhile
    \end{algorithmic}
 \end{algorithm}
 Model-based learning uses empirical models of $\cT$ and $\cR$
@@ -0,0 +1,28 @@
 \subsubsection{Reinforcement Learning}
 \bi{Passive} \textit{Direct Evaluation} Act according to policy $\vec{\pi}$, store sum of discounted rewards, average them. (But too simple)
 \textit{Sample-Based} Use $V_{i + 1}^{\vec{\pi}}(\vec{x}) = Q(\vec{x}, \vec{\pi}(\vec{x}))$ w/ $V_0^{\vec{\pi}}(\vec{x}) = 0$.
 This needs the state trans. model; without it, use sampled rewards $\tilde{R}_j$ instead (approx. prob. w/ statistics) and thus
 \[
    V_{i + 1}^{\vec{\pi}}(\vec{x}) = \frac{1}{N} \sum_{j = 1}^{N} [\tilde{R}_j(\vec{x}, \vec{\pi}(\vec{x}), \vec{x}_+) + \gamma V_i^{\vec{\pi}}(\vec{x}_+)]
 \]
 \bi{Active} Find optimal policy $\vec{\pi}$ instead of state values $V(\vec{x})$.
 \textit{Q-Learn.} Here: restate Val. Iter in $Q$-Values ($Q_0(\vec{x}, \vec{u}) = 0$):
 \[
    Q_{i + 1}(\vec{x}, \vec{u}) = Q(\vec{x}, \vec{u}) \quad \text{with } V_i(\vec{x}_+) = \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u}))
 \]
 Compute using sample:
 \[
    \tilde{Q}_i(\vec{x}, \vec{u}) = \tilde{R}_i(\vec{x}, \vec{u}, \vec{x}_+) + \gamma \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u}))
 \]
 Then update: $Q_{i + 1}(\vec{x}, \vec{u}) = (1 - \alpha)Q_i(\vec{x}, \vec{u}) + \alpha \tilde{Q}_i(\vec{x}, \vec{u})$.
 Called off-policy learning, needs exploration. Simplest is random actions ($\varepsilon$-greedy):
 $\varepsilon$ is prob. to act randomly, $1 - \varepsilon$ is prob. to act on pol. {\color{red} Space explored, still doing random stuff}
 \bi{Approaches} 
 \textit{Model-based} (estimate trans. model, e.g. Dyna),
 \textit{Value-based} (estimate val or $Q$-func and extract pol., e.g. Q-Learn),
 \textit{Actor-Critic} (estim. val or $Q$ of curr. pol., improve pol., e.g. A3C, SAC),
 \textit{Policy-Gradient} (diff. expect. reward w.r.t. params of policy network, e.g. REINFORCE)