#!/usr/bin/env python
# coding: utf-8

# In[1]:


# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#sns.set(font='SimHei', font_scale=2.5)
#plt.rcParams['axes.grid'] = False

import numpy as np
import pandas as pd
#pd.options.display.max_rows = 20

#import sklearn
#import itertools

#import logging
#logger = logging.getLogger()

#from IPython.display import SVG
from IPython.display import Image


# Chapter 3: Finite Markov Decision Processes
# ==========
#
# MDP (Markov Decision Process): actions influence not just immediate rewards, but also subsequent situations (states) and, through them, future rewards.

# ### 3.1 The Agent-Environment Interface

# In[3]:


Image('./res/fig3_1.png')


# *finite* MDP: the sets of states, actions, and rewards all have a finite number of elements.
#
# \begin{align}
# & p(s', r \mid s, a) \doteq \operatorname{Pr} \{ S_t = s', R_t = r \mid S_{t-1} = s, A_{t-1} = a \} \\
# & \displaystyle \sum_{s' \in \mathcal{S}} \sum_{r \in \mathcal{R}} p(s', r \mid s, a) = 1 \quad \text{, for all $s \in \mathcal{S}$, $a \in \mathcal{A}(s)$}
# \end{align}
#
# + state-transition probabilities:
#
# \begin{align*}
# p(s' \mid s, a) & \doteq \operatorname{Pr} \{ S_t = s' \mid S_{t-1} = s, A_{t-1} = a \} \\
# & = \sum_{r \in \mathcal{R}} p(s', r \mid s, a)
# \end{align*}
#
# + expected rewards for state-action pairs:
#
# \begin{align*}
# r(s, a) & \doteq \mathbb{E} \left [ R_t \mid S_{t-1} = s, A_{t-1} = a \right ] \\
# & = \sum_{r \in \mathcal{R}} \left ( r \sum_{s' \in \mathcal{S}} p(s', r \mid s, a) \right )
# \end{align*}
#
# + expected rewards for state-action-next-state triples:
#
# \begin{align*}
# r(s, a, s') & \doteq \mathbb{E} \left [ R_t \mid S_{t-1} = s, A_{t-1} = a, S_t = s' \right ] \\
# & = \sum_{r \in \mathcal{R}} r \frac{p(s', r \mid s, a)}{p(s' \mid s, a)}
# \end{align*}
#
# The agent-environment boundary represents the limit of the agent's *absolute control*, not of its knowledge.

# In[5]:


# Transition Graph
Image('./res/ex3_3.png')


# ##### Exercise 3.4
#
# $r(S_t, a) \; \pi(a \mid S_t)$

# ### 3.2 Goals and Rewards
#
# goal: to maximize the total amount of reward the agent receives.
#
# In particular, the reward signal is not the place to impart to the agent prior knowledge about *how* to achieve what we want it to do.

# ### 3.3 Returns and Episodes
#
# + episodic tasks: $G_t \doteq R_{t+1} + R_{t+2} + R_{t+3} + \cdots + R_T$, where $G_t$ is the *return*.
# + continuing tasks: $G_t \doteq R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots = \displaystyle \sum_{k=0}^\infty \gamma^k R_{t+k+1} = R_{t+1} + \gamma G_{t+1}$
#     - $\gamma$ is called the *discount rate*, with $0 \leq \gamma \leq 1$.
#     - The convention $G_T = 0$ often makes it easy to compute returns from reward sequences.
#
# ##### Exercise 3.8
#
# 0 for escaping from the maze and -1 at all other times
#
# ##### Exercise 3.10
#
# $G_1 = \frac{7}{1 - \gamma} = 70$
#
# $G_0 = R_1 + 0.9 G_1 = 65$
#
# ##### Exercise 3.11
#
# \begin{align}
# G_t &= \sum_{k=0}^\infty \gamma^k \\
# &= 1 + \gamma \sum_{k=0}^\infty \gamma^k \\
# &= 1 + \gamma G_t \\
# &= \frac1{1 - \gamma}
# \end{align}

# ### 3.4 Unified Notation for Episodic and Continuing Tasks
#
# $G_t \doteq \displaystyle \sum_{k=t+1}^T \gamma^{k-t-1} R_k$, including the possibility that $T = \infty$ or $\gamma = 1$ (but not both). A quick numerical check of these return definitions follows below.
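# A quick numerical check (my own sketch, not from the book): compute $G_t$ with the backward
# recursion $G_t = R_{t+1} + \gamma G_{t+1}$, starting from $G_T = 0$, and recover the Exercise 3.10
# answers by truncating the infinite tail of 7s.

# In[ ]:


def returns(rewards, gamma):
    """[G_0, ..., G_{T-1}] for a finite reward sequence [R_1, ..., R_T],
    via the backward recursion G_t = R_{t+1} + gamma * G_{t+1}, with G_T = 0."""
    g, out = 0.0, []
    for r in reversed(rewards):
        g = r + gamma * g
        out.append(g)
    return out[::-1]

# Exercise 3.10: gamma = 0.9, R_1 = 2 (implied by G_0 = R_1 + 0.9 * 70 = 65),
# followed by an endless run of 7s. Truncating after 200 terms is accurate to
# about 1e-7, since the discarded tail is bounded by 70 * 0.9**200.
g = returns([2] + [7] * 200, gamma=0.9)
print(g[0], g[1])  # ~= 65, 70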
# ### 3.5 Policies and Value Functions
#
# + value functions: estimate *how good* it is for the agent to be in a given state.
# + policy $\pi$: a mapping from states to probabilities of selecting each possible action.
# + The *value* of a state $s$ under a policy $\pi$, denoted $v_\pi(s)$, is the expected return when starting in $s$ and following $\pi$ thereafter:
#   $v_\pi(s) \doteq \mathbb{E}_\pi [ G_t \mid S_t = s ]$
#     - $v_\pi$: the state-value function for policy $\pi$
# + $q_\pi$: the action-value function for policy $\pi$, $q_\pi(s, a) \doteq \mathbb{E}_\pi [ G_t \mid S_t = s, A_t = a]$
#
# Bellman equation for $v_\pi$:
#
# \begin{equation}
# v_\pi(s) \doteq \displaystyle \sum_a \pi(a \mid s) \sum_{s', r} p(s', r \mid s, a) \left [ r + \gamma v_\pi(s') \right ] \quad \text{, for all $s \in \mathcal{S}$}
# \end{equation}
#
# The value functions $v_\pi$ and $q_\pi$ can be estimated from experience, e.g. with Monte Carlo methods (averaging many sampled returns). The cell below instead evaluates the equiprobable random policy on the gridworld of Example 3.5 by sweeping the Bellman equation directly.

# In[31]:


# Example 3.5: Gridworld -- iterative policy evaluation of the equiprobable
# random policy with gamma = 0.9, sweeping the Bellman equation for v_pi.
size, gamma = 5, 0.9
values = np.zeros((size, size))

# Special states: every action from A (0, 1) jumps to A' (4, 1) with reward +10,
# and every action from B (0, 3) jumps to B' (2, 3) with reward +5.
A, A_prime = (0, 1), (4, 1)
B, B_prime = (0, 3), (2, 3)
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right

for _ in range(100):
    new_values = np.zeros_like(values)
    for i in range(size):
        for j in range(size):
            for di, dj in actions:
                if (i, j) == A:
                    next_state, r = A_prime, 10
                elif (i, j) == B:
                    next_state, r = B_prime, 5
                elif 0 <= i + di < size and 0 <= j + dj < size:
                    next_state, r = (i + di, j + dj), 0
                else:
                    next_state, r = (i, j), -1  # off-grid move: stay put, reward -1
                new_values[i, j] += 0.25 * (r + gamma * values[next_state])
    values = new_values

# v(A) should come out near 8.8 = 10 + 0.9 * v(A')
pd.DataFrame(np.round(values, 1))


# ##### Exercise 3.12
#
# $0.9 \times (2.3 + 0.4 - 0.4 + 0.7) / 4 = 0.675 \approx 0.7$, so the Bellman equation holds at the center state (reward 0, $\gamma = 0.9$).
#
# ##### Exercise 3.13
#
# ##### Exercise 3.14
#
# $\sum_{k=0}^\infty \gamma^k C = \frac{C}{1 - \gamma}$ = a constant offset added to the value of every state, so the relative values of states (and hence the policy) are unchanged.

# ### 3.6 Optimal Policies and Optimal Value Functions
#
# optimal state-value function:
#
# \begin{align}
# v_\ast(s) & \doteq \displaystyle \max_\pi v_\pi(s) \quad \text{ for all } s \in \mathcal{S} \\
# & = \max_a \sum_{s', r} p(s', r \mid s, a) \left [ r + \gamma v_\ast(s') \right ]
# \end{align}
#
# Any policy that is *greedy* with respect to the optimal evaluation function $v_\ast$ is an optimal policy.
#
# optimal action-value function:
#
# \begin{align}
# q_\ast(s, a) & \doteq \max_\pi q_\pi(s, a) \quad \text{ for all $s \in \mathcal{S}$ and $a \in \mathcal{A}(s)$} \\
# & = \mathbb{E} [ R_{t+1} + \gamma v_\ast(S_{t+1}) \mid S_t = s, A_t = a ] \\
# & = \sum_{s', r} p(s', r \mid s, a) \left [ r + \gamma \max_{a'} q_\ast (s', a') \right ]
# \end{align}
#
# Explicitly solving the Bellman optimality equation relies on at least three assumptions that are rarely true in practice:
#
# 1. we accurately know the dynamics of the environment;
# 2. we have enough computational resources to complete the calculation;
# 3. the states have the Markov property.

# ### 3.7 Optimality and Approximation
#
# Extreme computational and memory costs => we must settle for approximations.
#
# A useful guideline: put more effort into learning to make good decisions for frequently encountered states, at the expense of less effort for infrequently encountered states.
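# A minimal sketch (my own, not the book's code) of solving the Bellman optimality equation for
# the same gridworld by value iteration: identical dynamics to the Example 3.5 cell above, but with
# a max over actions in place of the equiprobable average. As a sanity check, the optimal policy
# from A earns +10 every five steps, so $v_\ast(A) = 10 / (1 - \gamma^5) \approx 24.4$.

# In[ ]:


# Value iteration for v_* on the 5x5 gridworld (same dynamics as Example 3.5).
size, gamma = 5, 0.9
v_star = np.zeros((size, size))
A, A_prime = (0, 1), (4, 1)
B, B_prime = (0, 3), (2, 3)
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right

def backup(i, j, di, dj, v):
    """One-step lookahead r + gamma * v(s') for a single deterministic move."""
    if (i, j) == A:
        return 10 + gamma * v[A_prime]
    if (i, j) == B:
        return 5 + gamma * v[B_prime]
    if 0 <= i + di < size and 0 <= j + dj < size:
        return 0 + gamma * v[i + di, j + dj]
    return -1 + gamma * v[i, j]  # off-grid move: stay put, reward -1

for _ in range(200):
    # synchronous sweep: v_{k+1}(s) = max_a [ r + gamma * v_k(s') ]
    v_star = np.array([[max(backup(i, j, di, dj, v_star) for di, dj in actions)
                        for j in range(size)]
                       for i in range(size)])

# v_*(A) should be close to 10 / (1 - gamma**5) ~= 24.4
pd.DataFrame(np.round(v_star, 1))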