diff --git a/electives/amr/autonomous-mobile-robots-cheatsheet.pdf b/electives/amr/autonomous-mobile-robots-cheatsheet.pdf
index e1b3718..182d5f2 100644
Binary files a/electives/amr/autonomous-mobile-robots-cheatsheet.pdf and b/electives/amr/autonomous-mobile-robots-cheatsheet.pdf differ
diff --git a/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex b/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex
index ab4f9fe..1d85960 100644
--- a/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex
+++ b/electives/amr/parts/05_planning-control/01_motion-planning-exploration/02_rrt.tex
@@ -26,7 +26,7 @@
         \EndProcedure
     \end{algorithmic}
 \end{algorithm}
-Returns a collision-free path as grapha. Need nearest neighbour search.
+Returns a collision-free path as graph. Need nearest neighbour search.
 Extension to RRT* to make path better (because this is quite bad)
 \begin{algorithm}
     \begin{algorithmic}[1]
diff --git a/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex b/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex
index b314db0..f2d999a 100644
--- a/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex
+++ b/electives/amr/parts/05_planning-control/02_learning-to-act/00_intro.tex
@@ -1,4 +1,2 @@
 \subsection{Learning To Act}
 Used if there is no state-transition model or cost/reward func.
-
-\bi{Markov Decision Process}
diff --git a/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex b/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex
index e69de29..bd61ac8 100644
--- a/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex
+++ b/electives/amr/parts/05_planning-control/02_learning-to-act/01_mdp.tex
@@ -0,0 +1,31 @@
+\subsubsection{Markov Decision Process}
+The goal is to maximize the reward. Along the route, get small reward,
+at the end large reward (good or bad).
+
+Def. by states $\vec{x} \in \cX$ (RL: $s$), actions $\vec{u} \in \cU$ (RL: $a$), prob. state trans. $\cT(\vec{x}, \vec{u}, \vec{x}_+) = \P(\vec{x}_+ \divider \vec{x}, \vec{u})$,
+reward func $\cR(\vec{x}, \vec{u}, \vec{x}_+)$, start state $\vec{x}_0$, optional terminal state $\vec{x}_N$.
+
+\bi{Utility Func} Expected reward: $V = \sum_{k = 0}^{N} r_k$ or \textit{discounted} reward  $V = \sum_{k = 0}^{\8} \gamma^k r_k$ with $\gamma < 1$
+
+\bi{Solving} (Val iter) $V_0(\vec{x}) = 0$ and $V_{i + 1}(\vec{x}) = \max_{\vec{u}} Q(\vec{x}, \vec{u})$ with
+\[
+    Q(\vec{x}, \vec{u}) = \sum_{\vec{x}_+} \P(\vec{x}_+ \divider \vec{x}, \vec{u}) [\cR(\vec{x}, \vec{u}, \vec{x}_+) + \gamma V_i(\vec{x}_+)]
+\]
+Repeat until conv. to $V^*$ ($\tco{|\cU||\cX|^2}$ per iter). Optimal policy:
+\[
+    \vec{\pi}^*(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u})
+\]
+
+Using policy iter:
+\begin{algorithm}
+    \begin{algorithmic}[1]
+        \State Choose $\vec{\pi}_0(\vec{x})$
+        \While{\textit{policy} has not converged}
+        \Repeat $V_{i + 1}^{\vec{\pi}_j}(\vec{x}) = Q(\vec{x}, \vec{\pi}_j(\vec{x}))$ $\forall \vec{x}$ and \textit{fixed} pol. $\vec{\pi}_j$
+            \Until{values converge}
+            \State One step: $\vec{\pi}_{j + 1}(\vec{x}) = \text{argmax}_{\vec{u}} Q(\vec{x}, \vec{u})$ with $V_i = V_{i + 1}^{\vec{\pi}_j}$
+        \EndWhile
+    \end{algorithmic}
+\end{algorithm}
+
+Model-based learning uses empirical models of $\cT$ and $\cR$
diff --git a/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex b/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex
index e69de29..959f6cc 100644
--- a/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex
+++ b/electives/amr/parts/05_planning-control/02_learning-to-act/02_reinforcement-learning.tex
@@ -0,0 +1,28 @@
+\subsubsection{Reinforcement Learning}
+\bi{Passive} \textit{Direct Evaluation} Act according to policy $\vec{\pi}$, store sum of discounted rewards, average them. (But too simple)
+
+\textit{Sample-Based} Use $V_{i + 1}^{\vec{\pi}}(\vec{x}) = Q(\vec{x}, \vec{\pi}(\vec{x}))$ w/ $V_0^{\vec{\pi}}(\vec{x}) = 0$.
+This needs the state trans. model; instead use samples $\tilde{R}_j$ (approx. prob. w/ statistics) and thus
+\[
+    V_{i + 1}^{\vec{\pi}}(\vec{x}) = \frac{1}{N} \sum_{j = 1}^{N} [\tilde{R}_j(\vec{x}, \vec{\pi}(\vec{x}), \vec{x}_+) + \gamma V_i^{\vec{\pi}}(\vec{x}_+)]
+\]
+
+\bi{Active} Find optimal policy $\vec{\pi}$ instead of state values $V(\vec{x})$.
+
+\textit{Q-Learn.} Here: restate Val. Iter in $Q$-Values ($Q_0(\vec{x}, \vec{u}) = 0$):
+\[
+    Q_{i + 1}(\vec{x}, \vec{u}) = Q(\vec{x}, \vec{u}) \quad \text{with } V_i(\vec{x}_+) = \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u}))
+\]
+Compute using sample:
+\[
+    \tilde{Q}_i(\vec{x}, \vec{u}) = \tilde{R}_i(\vec{x}, \vec{u}, \vec{x}_+) + \gamma \max_{\vec{u}}(Q_i(\vec{x}_+, \vec{u}))
+\]
+Then update: $Q_{i + 1}(\vec{x}, \vec{u}) = (1 - \alpha)Q_i(\vec{x}, \vec{u}) + \alpha \tilde{Q}_i(\vec{x}, \vec{u})$.
+Called off-policy learning, needs exploration. Simplest is random actions ($\varepsilon$-greedy):
+$\varepsilon$ is prob. to act randomly, $1 - \varepsilon$ is prob. to act on pol. {\color{red} Downside: once space is explored, still acts randomly}
+
+\bi{Approaches} 
+\textit{Model-based} (estimate trans. model, e.g. Dyna),
+\textit{Value-based} (estimate val or $Q$-func and extract pol., e.g. Q-Learn),
+\textit{Actor-Critic} (estim. val or $Q$ of curr. pol., improve pol., e.g. A3C, SAC),
+\textit{Policy-Gradient} (diff. expect. reward w.r.t. params of policy network, e.g. REINFORCE)