diff --git a/electives/amr/autonomous-mobile-robots-cheatsheet.pdf b/electives/amr/autonomous-mobile-robots-cheatsheet.pdf index e1b3718..182d5f2 100644 Binary files a/electives/amr/autonomous-mobile-robots-cheatsheet.pdf and b/electives/amr/autonomous-mobile-robots-cheatsheet.pdf differ diff --git a/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex b/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex index ab4f9fe..1d85960 100644 --- a/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex +++ b/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex @@ -26,7 +26,7 @@ \EndProcedure \end{algorithmic} \end{algorithm} -Returns a collision-free path as grapha. Need nearest neighbour search. +Returns a collision-free path as graph. Need nearest neighbour search. Extension to RRT* to make path better (because this is quite bad) \begin{algorithm} \begin{algorithmic}[1] diff --git a/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex b/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex index b314db0..f2d999a 100644 --- a/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex +++ b/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex @@ -1,4 +1,2 @@ \subsection{Learning To Act} Used if there is no state-transition model or cost/reward func. - -\bi{Markov Decision Process} diff --git a/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex b/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex index e69de29..bd61ac8 100644 --- a/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex +++ b/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex @@ -0,0 +1,31 @@ +\subsubsection{Markov Decision Process} +The goal is to maximize the reward. Along the route, get small reward, +at the end large reward (good or bad). + +Def. 
by states $\vec{x} \in \cX$ (RL: $s$), actions $\vec{u} \in \cU$ (RL: $a$), prob. state trans. $\cT(\vec{x}, \vec{u}, \vec{x}_+) = \P(\vec{x}_+ \divider \vec{x}, \vec{u})$, +reward func $\cR(\vec{x}, \vec{u}, \vec{x}_+)$, start state $\vec{x}_0$, optional terminal state $\vec{x}_N$. + +\bi{Utility Func} Expected reward: $V = \sum_{k = 0}^{N} r_k$ or \textit{discounted} reward $V = \sum_{k = 0}^{\8} \gamma^k r_k$ with $\gamma < 1$ + +\bi{Solving} (Val iter) $V_0(\vec{x}) = 0$ and $V_{i + 1}(\vec{x}) = \max_{\vec{u}} Q(\vec{x}, \vec{u})$ with +\[ + Q(\vec{x}, \vec{u}) = \sum_{\vec{x}_+} \P(\vec{x}_+ \divider \vec{x}, \vec{u}) [\cR(\vec{x}, \vec{u}, \vec{x}_+) + \gamma V_i(\vec{x}_+)] +\] +Repeat until conv. to $V^*$ ($\tco{|\cU||\cX|^2}$ per iter). Optimal policy: +\[ + \vec{\pi}^*(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u}) +\] + +Using policy iter: +\begin{algorithm} + \begin{algorithmic}[1] + \State Choose $\vec{\pi}_0(\vec{x})$ + \While{\textit{policy} has not converged} + \Repeat $V_{i + 1}^{\vec{\pi}_j}(\vec{x}) = Q(\vec{x}, \vec{\pi}_j(\vec{x}))$ $\forall \vec{x}$ and \textit{fixed} pol. $\vec{\pi}_j$ + \Until{values converge} + \EndWhile + \State One step: $\vec{\pi}_{j + 1}(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u})$ with $V_i = V_{i + 1}^{\vec{\pi}_j}$ + \end{algorithmic} +\end{algorithm} + +Model-based learning uses empirical models of $\cT$ and $\cR$ diff --git a/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex b/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex index e69de29..959f6cc 100644 --- a/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex +++ b/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex @@ -0,0 +1,28 @@ +\subsubsection{Reinforcement Learning} +\bi{Passive} \textit{Direct Evaluation} Act according to policy $\vec{\pi}$, store sum of discounted rewards, average them. 
(But too simple) + +\textit{Sample-Based} Use $V_{i + 1}^{\vec{\pi}}(\vec{x}) = Q(\vec{x}, \vec{\pi}(\vec{x}))$ w/ $V_0^{\vec{\pi}}(\vec{x}) = 0$. +This needs the state trans. model; instead use samples $\tilde{R}_j$ (approx. prob. w/ statistics) and thus +\[ + V_{i + 1}^{\vec{\pi}}(\vec{x}) = \frac{1}{N} \sum_{j = 1}^{N} [\tilde{R}_j(\vec{x}, \vec{\pi}(\vec{x}), \vec{x}_+) + \gamma V_i^{\vec{\pi}}(\vec{x}_+)] +\] + +\bi{Active} Find optimal policy $\vec{\pi}$ instead of state values $V(\vec{x})$. + +\textit{Q-Learn.} Here: restate Val. Iter in $Q$-Values ($Q_0(\vec{x}, \vec{u}) = 0$): +\[ + Q_{i + 1}(\vec{x}, \vec{u}) = Q(\vec{x}, \vec{u}) \quad \text{with } V_i(\vec{x}_+) = \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u})) +\] +Compute using sample: +\[ + \tilde{Q}_i(\vec{x}, \vec{u}) = \tilde{R}_i(\vec{x}, \vec{u}, \vec{x}_+) + \gamma \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u})) +\] +Then update: $Q_{i + 1}(\vec{x}, \vec{u}) = (1 - \alpha)Q_i(\vec{x}, \vec{u}) + \alpha \tilde{Q}_i(\vec{x}, \vec{u})$. +Called off-policy learning, needs exploration. Simplest is random actions ($\varepsilon$-greedy): +$\varepsilon$ is prob. to act randomly, $1 - \varepsilon$ is prob. to act on pol. {\color{red} Space explored, still doing random stuff} + +\bi{Approaches} +\textit{Model-based} (estimate trans. model, e.g. Dyna), +\textit{Value-based} (estimate val or $Q$-func and extract pol., e.g. Q-Learn), +\textit{Actor-Critic} (estim. val or $Q$ of curr. pol., improve pol., e.g. A3C, SAC), +\textit{Policy-Gradient} (diff. expect. reward w.r.t. params of policy network, e.g. REINFORCE)