#!/usr/bin/env python
# coding: utf-8

# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image


# Chapter 7 n-step Bootstrapping
# =================
# 
# MC <-- n-step TD --> one-step TD

# In[3]:


Image('./res/fig7_1.png')


# ### 7.1 n-step TD Prediction
# 
# Monte Carlo return:
# 
# $G_t \doteq R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots + \gamma^{T-t-1} R_T$
# 
# one-step return:
# 
# $G_{t:t+1} \doteq R_{t+1} + \gamma V_t(S_{t+1})$
# 
# two-step return:
# 
# $G_{t:t+2} \doteq R_{t+1} + \gamma R_{t+2} + \gamma^2 V_{t+1}(S_{t+2})$
# 
# n-step return:
# 
# $G_{t:t+n} \doteq R_{t+1} + \gamma R_{t+2} + \cdots + \gamma^{n-1} R_{t+n} + \gamma^n V_{t+n-1}(S_{t+n})$
# 
# The natural state-value learning algorithm for using n-step returns is thus:
# 
# $V_{t+n}(S_t) \doteq V_{t+n-1}(S_t) + \alpha \left [ G_{t:t+n} - V_{t+n-1}(S_t) \right ] \qquad 0 \leq t < T$

# In[4]:


Image('./res/n_step_predict.png')


# Error reduction property of n-step returns: their expectation is guaranteed to be a better estimate of $v_\pi$ than $V_{t+n-1}$ is, in a worst-state sense:
# 
# $\max_s \left | \mathbb{E}_\pi [G_{t:t+n} \mid S_t = s] - v_\pi(s) \right | \leq \gamma^n \max_s \left | V_{t+n-1}(s) - v_\pi(s) \right |$

# ### 7.2 n-step Sarsa
# 
# The main idea is simply to switch states for actions (state-action pairs) and then use an $\epsilon$-greedy policy:
# 
# $Q_{t+n}(S_t, A_t) \doteq Q_{t+n-1}(S_t, A_t) + \alpha [G_{t:t+n} - Q_{t+n-1}(S_t, A_t)] \qquad 0 \leq t < T$

# In[5]:


Image('./res/fig7_3.png')


# n-step Expected Sarsa (analogous to n-step Sarsa, but with an expected value at the end):
# 
# \begin{align}
# G_{t:t+n} & \doteq R_{t+1} + \cdots + \gamma^{n-1} R_{t+n} + \gamma^n \bar{V}_{t+n-1}(S_{t+n}) \qquad t + n < T \\
# \bar{V}_t(s) & \doteq \sum_a \pi(a \mid s) Q_t(s, a) \qquad \text{for all $s \in \mathcal{S}$}
# \end{align}

# ### 7.3 n-step Off-policy Learning by Importance Sampling
# 
# The n-step updates are simply weighted by the importance sampling ratio $\rho_{t:t+n-1}$:
# 
# \begin{align}
# V_{t+n}(S_t) & \doteq V_{t+n-1}(S_t) + \alpha \rho_{t:t+n-1} [G_{t:t+n} - V_{t+n-1}(S_t)] \qquad 0 \leq t < T \\
# \rho_{t:h} & \doteq \prod_{k=t}^{\min(h, T-1)} \frac{\pi(A_k \mid S_k)}{b(A_k \mid S_k)} \\
# Q_{t+n}(S_t, A_t) & \doteq Q_{t+n-1}(S_t, A_t) + \alpha \rho_{t+1:t+n-1} [G_{t:t+n} - Q_{t+n-1}(S_t, A_t)] \qquad 0 \leq t < T
# \end{align}

# ### 7.4 Per-decision Off-policy Methods with Control Variates

# ### 7.5 Off-policy Learning Without Importance Sampling: The n-step Tree Backup Algorithm
# 
# Tree-backup algorithm:
# 
# In the tree-backup update, the target includes the rewards along the way and the estimated value of the bottom node, plus the estimated values of the dangling action nodes hanging off the sides, at all levels.

# In[6]:


Image('./res/tree_update.png')


# ### 7.6 A Unifying Algorithm: n-step $Q(\sigma)$
# 
# One can decide on a step-by-step basis whether to take the action as a sample (as in Sarsa) or to consider the expectation over all actions instead (as in the tree-backup update).
# 
# Let $\sigma(t) \in [0, 1]$ denote the degree of sampling on step $t$, with $\sigma = 1$ denoting full sampling and $\sigma = 0$ denoting a pure expectation with no sampling. We call this proposed new algorithm n-step $Q(\sigma)$.

# In[7]:


Image('./res/fig7_5.png')
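

# The prediction update above can be turned into a short implementation. Below is a
# minimal sketch of n-step TD prediction on a simple random-walk chain; the environment
# (19 states, start in the middle, rewards -1/+1 at the two ends) and the hyperparameters
# are illustrative assumptions, not the book's exact Example 7.1 setup.

# In[8]:


import numpy as np


def random_walk_episode(n_states=19, rng=None):
    """Generate one episode under the equiprobable random policy: start in the
    middle state, step left/right with equal probability, reward -1 at the left
    terminal, +1 at the right terminal, 0 otherwise (assumed environment)."""
    if rng is None:
        rng = np.random.default_rng()
    states = [n_states // 2]      # states[t] = S_t (None marks the terminal state)
    rewards = [0.0]               # rewards[t] = R_t (index 0 is an unused placeholder)
    s = states[0]
    while True:
        s = s + rng.choice([-1, 1])
        if s < 0:
            rewards.append(-1.0); states.append(None); break
        if s >= n_states:
            rewards.append(1.0); states.append(None); break
        rewards.append(0.0); states.append(s)
    return states, rewards


def n_step_td_prediction(n=4, alpha=0.1, gamma=1.0, episodes=100, n_states=19, seed=0):
    """n-step TD: V(S_tau) <- V(S_tau) + alpha * (G_{tau:tau+n} - V(S_tau)).
    Updates are applied in order of tau after each episode, which matches the
    update order of the online algorithm because the policy is fixed."""
    rng = np.random.default_rng(seed)
    V = np.zeros(n_states)
    for _ in range(episodes):
        states, rewards = random_walk_episode(n_states, rng)
        T = len(states) - 1                          # terminal time step
        for tau in range(T):
            # n-step return G_{tau:tau+n}: up to n discounted rewards ...
            G = sum(gamma ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:                          # ... plus a bootstrap from V(S_{tau+n})
                G += gamma ** n * V[states[tau + n]]
            V[states[tau]] += alpha * (G - V[states[tau]])
    return V


n_step_td_prediction(n=4)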
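

# Similarly, here is a minimal sketch of n-step Sarsa with an $\epsilon$-greedy policy on a
# small hypothetical 4x4 gridworld (start at the top-left, goal at the bottom-right,
# reward -1 per step); the environment and the parameters are assumptions for
# illustration, not the windy-gridworld example from the book.

# In[9]:


import numpy as np

SIZE = 4
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]           # up, down, left, right
GOAL = (SIZE - 1, SIZE - 1)


def step(state, a):
    """Deterministic transition: move within the grid, reward -1 per step (assumed)."""
    r, c = state
    dr, dc = ACTIONS[a]
    nr, nc = min(max(r + dr, 0), SIZE - 1), min(max(c + dc, 0), SIZE - 1)
    return (nr, nc), -1.0


def eps_greedy(Q, s, eps, rng):
    """Pick a random action with probability eps, otherwise a greedy one."""
    if rng.random() < eps:
        return rng.integers(len(ACTIONS))
    return int(np.argmax(Q[s]))


def n_step_sarsa(n=3, alpha=0.1, gamma=1.0, eps=0.1, episodes=200, seed=0):
    """n-step Sarsa: Q(S_tau,A_tau) <- Q(S_tau,A_tau) + alpha * (G_{tau:tau+n} - Q(S_tau,A_tau))."""
    rng = np.random.default_rng(seed)
    Q = {(r, c): np.zeros(len(ACTIONS)) for r in range(SIZE) for c in range(SIZE)}
    for _ in range(episodes):
        S = [(0, 0)]                                   # S[t] = S_t
        A = [eps_greedy(Q, S[0], eps, rng)]            # A[t] = A_t
        R = [0.0]                                      # R[t] = R_t (index 0 unused)
        T = float('inf')
        t = 0
        while True:
            if t < T:                                  # still generating the episode
                s_next, r = step(S[t], A[t])
                S.append(s_next); R.append(r)
                if s_next == GOAL:
                    T = t + 1
                else:
                    A.append(eps_greedy(Q, s_next, eps, rng))
            tau = t - n + 1                            # time whose estimate is updated
            if tau >= 0:
                G = sum(gamma ** (i - tau - 1) * R[i]
                        for i in range(tau + 1, min(tau + n, T) + 1))
                if tau + n < T:                        # bootstrap from Q(S_{tau+n}, A_{tau+n})
                    G += gamma ** n * Q[S[tau + n]][A[tau + n]]
                Q[S[tau]][A[tau]] += alpha * (G - Q[S[tau]][A[tau]])
            if tau == T - 1:
                break
            t += 1
    return Q


Q = n_step_sarsa()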