#!/usr/bin/env python
# coding: utf-8

# In[1]:


# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#sns.set(font='SimHei', font_scale=2.5)
#plt.rcParams['axes.grid'] = False

import numpy as np
import pandas as pd
#pd.options.display.max_rows = 20

import logging
logging.basicConfig()
logger = logging.getLogger()

from IPython.display import Image

import enum


# Chapter 4 Dynamic Programming
# ================
#
# Limitations:
# + assumes a perfect model of the environment
# + great computational expense

# ### 4.1 Policy Evaluation (Prediction)

# In[2]:


Image('./res/iterative_policy_evaluation.png')


# In[3]:


Image('./res/ex4_1.png')


# In[4]:


class Action(enum.Enum):
    EAST = enum.auto()
    WEST = enum.auto()
    SOUTH = enum.auto()
    NORTH = enum.auto()

    @staticmethod
    def move(x, y, action):
        # (x, y) = (row, column); staying on the grid is checked by the caller.
        if action == Action.EAST:
            return x, y - 1
        elif action == Action.WEST:
            return x, y + 1
        elif action == Action.SOUTH:
            return x + 1, y
        elif action == Action.NORTH:
            return x - 1, y


class GridWorld(object):
    # 4x4 gridworld of Example 4.1: states 0 and 15 are terminal,
    # every transition earns reward -1, and moves off the grid leave the state unchanged.
    def move(self, s, action):
        if s == 0 or s == 15:
            return s, 0
        elif 0 < s < 15:
            x = s // 4
            y = s % 4
            x1, y1 = Action.move(x, y, action)
            if 0 <= x1 < 4 and 0 <= y1 < 4:
                s1 = x1 * 4 + y1
                return s1, -1
            else:
                return s, -1
        else:
            raise ValueError('s {} must be in [0, 15]'.format(s))


class RandomPolicy(object):
    # In-place iterative policy evaluation for the equiprobable random policy.
    def __init__(self, grid_world):
        self._grid_world = grid_world
        self._v = np.zeros((4, 4))
        self._v_flatten = self._v.ravel()   # view into self._v
        self._delta = 0

    def iterate(self):
        v = self._v.copy()
        for s in range(0, 16):
            self.update_value(s)
        # largest change in this sweep, usable as a stopping criterion
        self._delta = np.max(np.abs(v - self._v))
        return self._v.copy()

    def get_pi(self, s):
        # equiprobable random policy: \pi(a \mid s) = 0.25 for each of the four actions
        return [(0.25, (s, a)) for a in [Action.EAST, Action.WEST, Action.SOUTH, Action.NORTH]]

    def update_value(self, s):
        # V(s) = \sum_a \pi(a \mid s) \sum_{s', r} p(s', r \mid s, a) [r + \gamma V(s')]
        # with deterministic transitions (p = 1) and \gamma = 1.
        vs = []
        for (prob, (s, a)) in self.get_pi(s):
            s1, r = self._grid_world.move(s, a)
            vs.append(prob * (r + self._v_flatten[s1]))
        logger.debug('vs: {}'.format(vs))
        self._v_flatten[s] = np.sum(vs)


# In[5]:


# logger.setLevel(logging.DEBUG)

r = RandomPolicy(GridWorld())
for _ in range(100):
    r.iterate()

pd.DataFrame(np.round(r.iterate()))


# ### 4.2 Policy Improvement
#
# \begin{align*}
#     q_\pi(s, a) = \sum_{s', r} p(s', r \mid s, a) \left [ r + \gamma v_\pi(s') \right ]
# \end{align*}
#
# Policy improvement theorem: if $q_\pi(s, \pi'(s)) \geq v_\pi(s)$ for all $s \in \mathcal{S}$, then $v_{\pi'}(s) \geq v_\pi(s)$ for all $s \in \mathcal{S}$.
#
# => new greedy policy: $\pi'(s) = \operatorname{arg \, max}_a q_\pi(s, a)$, which is the policy improvement step.
#
# If there are ties in the policy improvement step, each maximizing action can be given a portion of the probability of being selected in the new greedy policy.

# In[6]:


Image('./res/fig4_1.png')


# ### 4.3 Policy Iteration
#
# policy iteration:
#
# \begin{align*}
#     \pi_0 \overset{E}{\longrightarrow} v_{\pi_0} \overset{I}{\longrightarrow} \pi_1 \overset{E}{\longrightarrow} v_{\pi_1} \overset{I}{\longrightarrow} \pi_2 \overset{E}{\longrightarrow} \cdots \overset{I}{\longrightarrow} \pi_\ast \overset{E}{\longrightarrow} v_{\pi_\ast}
# \end{align*}

# In[7]:


Image('./res/policy_iteration.png')


# In[8]:


Image('./res/fig4_2.png')


# ### 4.4 Value Iteration
#
# Truncated policy evaluation
#
# => value iteration: policy evaluation is stopped after just one sweep (one update of each state), and that sweep is combined with the policy improvement step by taking a max over actions.

# In[9]:


Image('./res/value_iteration.png')
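
# The next cell is not from the book: it is a minimal in-place sketch of value
# iteration (Section 4.4) on the 4x4 gridworld defined above, followed by the
# greedy-policy extraction of Section 4.2 (keeping ties). `value_iteration`,
# `greedy_actions`, `gamma` and `theta` are names and defaults chosen here
# (assuming $\gamma = 1$, $\theta = 10^{-4}$), not the book's pseudocode.

# In[ ]:


ALL_ACTIONS = [Action.EAST, Action.WEST, Action.SOUTH, Action.NORTH]


def value_iteration(grid_world, gamma=1.0, theta=1e-4):
    # V(s) <- max_a \sum_{s', r} p(s', r | s, a) [r + gamma V(s')]
    # Transitions are deterministic here, so the inner sum has a single term.
    v = np.zeros(16)
    while True:
        delta = 0.0
        for s in range(1, 15):          # states 0 and 15 are terminal, keep V = 0
            old = v[s]
            v[s] = max(r + gamma * v[s1]
                       for s1, r in (grid_world.move(s, a) for a in ALL_ACTIONS))
            delta = max(delta, abs(old - v[s]))
        if delta < theta:               # stop once a sweep changes no state by much
            return v


def greedy_actions(grid_world, v, gamma=1.0):
    # policy improvement step: pi'(s) = argmax_a q(s, a), keeping all ties
    policy = {}
    for s in range(1, 15):
        q = {}
        for a in ALL_ACTIONS:
            s1, r = grid_world.move(s, a)
            q[a] = r + gamma * v[s1]
        best = max(q.values())
        policy[s] = [a.name for a, qa in q.items() if np.isclose(qa, best)]
    return policy


v_star = value_iteration(GridWorld())
print(greedy_actions(GridWorld(), v_star))
pd.DataFrame(v_star.reshape(4, 4))
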
# ### 4.5 Asynchronous Dynamic Programming
#
# Avoid systematic sweeps: back up state values in place, in any order, using whatever values of other states happen to be available.
#
# This lets us focus the DP algorithm's updates onto the parts of the state set that are most relevant to the agent (a minimal sketch of the idea appears at the end of this notebook).

# ### 4.6 Generalized Policy Iteration
#
# Almost all reinforcement learning methods are well described as GPI: interacting policy-evaluation and policy-improvement processes, each driving the value function or the policy toward the other:

# In[10]:


Image('./res/gpi.png')


# ### 4.7 Efficiency of Dynamic Programming
#
# DP: worst-case time polynomial in the number of states and actions, even though the number of deterministic policies is exponential in the number of states.

# In[ ]:
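
# To make Section 4.5 concrete, the cell below is a small sketch (not from the
# book) of asynchronous, in-place value iteration: instead of sweeping all
# states, it repeatedly backs up a single randomly chosen state, using whatever
# values of the other states are currently available. `async_value_iteration`,
# `n_updates` and `seed` are names and defaults chosen here.

# In[ ]:


def async_value_iteration(grid_world, n_updates=2000, gamma=1.0, seed=0):
    rng = np.random.RandomState(seed)
    actions = [Action.EAST, Action.WEST, Action.SOUTH, Action.NORTH]
    v = np.zeros(16)
    for _ in range(n_updates):
        s = rng.randint(1, 15)          # pick any non-terminal state, in any order
        # same backup as value iteration, applied to this one state in place
        v[s] = max(r + gamma * v[s1]
                   for s1, r in (grid_world.move(s, a) for a in actions))
    return v


pd.DataFrame(async_value_iteration(GridWorld()).reshape(4, 4))
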