#!/usr/bin/env python
# coding: utf-8

# In[1]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from IPython.display import Image


# Chapter 6 Temporal-Difference Learning
# =====================
#
# DP, TD, and Monte Carlo methods all use some variation of generalized policy iteration; the differences lie primarily in their approaches to the prediction problem.

# ### 6.1 TD Prediction
#
# constant-$\alpha$ MC: $V(S_t) \gets V(S_t) + \alpha \underbrace{\left [ G_t - V(S_t) \right ]}_{= \sum_{k=t}^{T-1} \gamma^{k-t} \delta_k}$
#
# \begin{align}
# v_\pi(s) &\doteq \mathbb{E}_\pi [ G_t \mid S_t = s] \qquad \text{Monte Carlo} \\
#          &= \mathbb{E}_\pi [ R_{t+1} + \gamma \color{blue}{v_\pi(S_{t+1})} \mid S_t = s ] \quad \text{DP}
# \end{align}
#
# one-step TD, or TD(0): $V(S_t) \gets V(S_t) + \alpha \left [ \underbrace{R_{t+1} + \gamma \color{blue}{V(S_{t+1})} - V(S_t)}_{\text{TD error: } \delta_t} \right ]$
#
# TD(0) samples the expected value (like Monte Carlo) and uses the current estimate $V$ in place of the true $v_\pi$ (like DP).

# In[3]:

Image('./res/fig6_1.png')


# In[2]:

Image('./res/TD_0.png')


# ### 6.2 Advantages of TD Prediction Methods
#
# TD methods learn a guess from a guess - they *bootstrap*.
#
# + advantages:
#     1. over DP: TD methods do not require a model of the environment, of its reward and next-state probability distributions.
#     2. over Monte Carlo: TD methods are naturally implemented in an online, fully incremental fashion.
#
# Convergence: for any fixed policy $\pi$, TD(0) converges to $v_\pi$ (in the mean for a sufficiently small constant $\alpha$, and with probability 1 if $\alpha$ decreases according to the usual stochastic approximation conditions).
#
# In practice, TD methods have usually been found to converge faster than constant-$\alpha$ MC methods on stochastic tasks.

# ### 6.3 Optimality of TD(0)
#
# batch updating: updates are made only after processing each complete batch of training data, repeatedly, until the value function converges.
#
# + Batch Monte Carlo methods always find the estimates that minimize mean-squared error on the training set.
# + Batch TD(0) always finds the estimates that would be exactly correct for the maximum-likelihood model of the Markov process.

# ### 6.4 Sarsa: On-policy TD Control
#
# $Q(S_t, A_t) \gets Q(S_t, A_t) + \alpha \left [ R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t) \right ]$

# In[3]:

Image('./res/sarsa.png')


# ### 6.5 Q-learning: Off-policy TD Control
#
# $Q(S_t, A_t) \gets Q(S_t, A_t) + \alpha \left [ R_{t+1} + \gamma \color{blue}{\max_a Q(S_{t+1}, a)} - Q(S_t, A_t) \right ]$

# In[4]:

Image('./res/q_learn_off_policy.png')


# ### 6.6 Expected Sarsa
#
# Expected Sarsa uses the expected value over next actions, weighting each action by how likely it is under the current policy:
#
# \begin{align}
# Q(S_t, A_t) & \gets Q(S_t, A_t) + \alpha \left [ R_{t+1} + \gamma \color{blue}{\mathbb{E}_\pi [Q(S_{t+1}, A_{t+1}) \mid S_{t+1}]} - Q(S_t, A_t) \right ] \\
#             & \gets Q(S_t, A_t) + \alpha \left [ R_{t+1} + \gamma \color{blue}{\sum_a \pi(a \mid S_{t+1}) Q(S_{t+1}, a)} - Q(S_t, A_t) \right ]
# \end{align}
#
# + con: additional computational cost.
# + pro: eliminates the variance due to the random selection of $A_{t+1}$.
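# Sarsa, Q-learning, and Expected Sarsa differ only in the target of the update $Q(S_t, A_t) \gets Q(S_t, A_t) + \alpha [\text{target} - Q(S_t, A_t)]$. Below is a minimal sketch (not from the book) that computes the three targets from a tabular $Q$, assuming an $\varepsilon$-greedy policy; all names (`td_targets`, `next_state`, ...) are illustrative.

# In[ ]:

def td_targets(Q, reward, next_state, next_action, gamma=1.0, epsilon=0.1):
    """Compute the one-step targets of Sarsa, Q-learning, and Expected Sarsa.

    Q is a (n_states, n_actions) numpy array; for a terminal next_state
    the bootstrap term should be dropped (not handled here).
    """
    n_actions = Q.shape[1]

    # Sarsa: bootstrap from the action actually taken next (on-policy).
    sarsa = reward + gamma * Q[next_state, next_action]

    # Q-learning: bootstrap from the greedy action (off-policy).
    q_learning = reward + gamma * Q[next_state].max()

    # Expected Sarsa: bootstrap from the expectation under the
    # epsilon-greedy policy pi(a | next_state).
    pi = np.full(n_actions, epsilon / n_actions)
    pi[Q[next_state].argmax()] += 1.0 - epsilon
    expected_sarsa = reward + gamma * pi.dot(Q[next_state])

    return sarsa, q_learning, expected_sarsa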
# ### 6.7 Maximization Bias and Double Learning
#
# maximization bias:
# + using a maximum over estimated values as an estimate of the maximum value => significant positive bias.
#
# root of the problem: the same samples (plays) are used both to determine the maximizing action and to estimate its value. => remedy: divide the plays into two sets and use them to learn two independent estimates, $Q_1$ and $Q_2$ (*double learning*):
#
# $Q_1(S_t, A_t) \gets Q_1(S_t, A_t) + \alpha \left [ R_{t+1} + \gamma Q_2 \left( S_{t+1}, \operatorname{argmax}_a Q_1(S_{t+1}, a) \right) - Q_1(S_t, A_t) \right ]$

# In[5]:

Image('./res/double_learn.png')


# ### 6.8 Games, Afterstates, and Other Special Cases

# In[ ]:
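# Back to Section 6.7: a minimal sketch (not the book's algorithm box verbatim) of one tabular double Q-learning update, with the roles of $Q_1$ and $Q_2$ swapped at random. The behaviour policy (e.g. $\varepsilon$-greedy with respect to $Q_1 + Q_2$) is assumed to be handled by the caller, and all names are illustrative.

# In[ ]:

def double_q_update(Q1, Q2, state, action, reward, next_state,
                    alpha=0.1, gamma=1.0):
    """One tabular double Q-learning update (cf. Section 6.7).

    Q1 and Q2 are (n_states, n_actions) numpy arrays; for a terminal
    next_state the bootstrap term should be dropped (not handled here).
    """
    if np.random.rand() < 0.5:
        # Q1 selects the maximizing action, Q2 evaluates it.
        best = Q1[next_state].argmax()
        target = reward + gamma * Q2[next_state, best]
        Q1[state, action] += alpha * (target - Q1[state, action])
    else:
        # Roles swapped: Q2 selects, Q1 evaluates.
        best = Q2[next_state].argmax()
        target = reward + gamma * Q1[next_state, best]
        Q2[state, action] += alpha * (target - Q2[state, action])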