#!/usr/bin/env python
# coding: utf-8

# # DiscreteDP
#
# ***Implementation Details***
#
# **Daisuke Oyama**
#
# *Faculty of Economics, University of Tokyo*

# This notebook describes the implementation details of the `DiscreteDP` class.
#
# For the theoretical background and notation,
# see the lecture [Discrete Dynamic Programming](http://quant-econ.net/py/discrete_dp.html).

# ## Solution methods

# The `DiscreteDP` class currently implements the following solution algorithms:
#
# * value iteration;
# * policy iteration (default);
# * modified policy iteration.
#
# Policy iteration computes an exact optimal policy in finitely many iterations,
# while value iteration and modified policy iteration return an $\varepsilon$-optimal policy
# for a prespecified value of $\varepsilon$.
#
# Value iteration relies only on the fact that
# the Bellman operator $T$ is a contraction mapping,
# so that iterative application of $T$ to any initial function $v^0$
# converges to its unique fixed point $v^*$.
#
# Policy iteration exploits the structure of the problem more closely:
# each iteration consists of a policy evaluation step,
# which computes the value $v_{\sigma}$ of a policy $\sigma$
# by solving the linear equation $v = T_{\sigma} v$,
# and a policy improvement step, which computes a $v_{\sigma}$-greedy policy.
#
# Modified policy iteration replaces the policy evaluation step
# in policy iteration with "partial policy evaluation",
# which approximates the value of a policy $\sigma$
# by iterating $T_{\sigma}$ a specified number of times.

# Below we describe our implementation of these algorithms in more detail.
# (While not stated explicitly below, in the actual implementation each algorithm is also
# terminated when the number of iterations reaches `iter_max`.)

# ### Value iteration
#
# `DiscreteDP.value_iteration(v_init, epsilon, iter_max)`
#
# 1. Choose any $v^0 \in \mathbb{R}^n$, and
#    specify $\varepsilon > 0$; set $i = 0$.
# 2. Compute $v^{i+1} = T v^i$.
# 3. If $\lVert v^{i+1} - v^i\rVert < [(1 - \beta) / (2\beta)] \varepsilon$,
#    then go to step 4;
#    otherwise, set $i = i + 1$ and go to step 2.
# 4. Compute a $v^{i+1}$-greedy policy $\sigma$, and return $v^{i+1}$ and $\sigma$.

# Given $\varepsilon > 0$,
# the value iteration algorithm terminates in a finite number of iterations,
# and returns an $\varepsilon/2$-approximation of the optimal value function and
# an $\varepsilon$-optimal policy function
# (unless `iter_max` is reached).

# ### Policy iteration
#
# `DiscreteDP.policy_iteration(v_init, iter_max)`
#
# 1. Choose any $v^0 \in \mathbb{R}^n$ and compute a $v^0$-greedy policy $\sigma^0$;
#    set $i = 0$.
# 2. [Policy evaluation]
#    Compute the value $v_{\sigma^i}$ by solving the equation $v = T_{\sigma^i} v$.
# 3. [Policy improvement]
#    Compute a $v_{\sigma^i}$-greedy policy $\sigma^{i+1}$;
#    let $\sigma^{i+1} = \sigma^i$ if possible.
# 4. If $\sigma^{i+1} = \sigma^i$,
#    then return $v_{\sigma^i}$ and $\sigma^{i+1}$;
#    otherwise, set $i = i + 1$ and go to step 2.

# The policy iteration algorithm terminates in a finite number of iterations, and
# returns an optimal value function and an optimal policy function
# (unless `iter_max` is reached).
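# For concreteness, here is a plain-NumPy sketch of the value iteration and policy iteration
# loops just described, written for a dense reward array `R` of shape $n \times m$ and a dense
# transition array `Q` of shape $n \times m \times n$.
# This is an illustrative sketch only, not the actual `DiscreteDP` implementation;
# in particular, ties are broken by `argmax` rather than by keeping the previous policy,
# and the function names are ad hoc.

# In[ ]:

import numpy as np

def value_iteration_sketch(R, Q, beta, epsilon=1e-3, max_iter=10**5):
    # Sketch of the value iteration steps above (not the DiscreteDP code).
    n = R.shape[0]
    v = np.zeros(n)
    tol = epsilon * (1 - beta) / (2 * beta)
    for _ in range(max_iter):
        v_new = (R + beta * (Q @ v)).max(axis=1)   # v_new = T v
        if np.abs(v_new - v).max() < tol:          # norm-based termination rule
            v = v_new
            break
        v = v_new
    sigma = (R + beta * (Q @ v)).argmax(axis=1)    # v-greedy policy
    return v, sigma

def policy_iteration_sketch(R, Q, beta, max_iter=1000):
    # Sketch of the policy iteration steps above (not the DiscreteDP code).
    n = R.shape[0]
    v = np.zeros(n)
    sigma = (R + beta * (Q @ v)).argmax(axis=1)    # v^0-greedy policy with v^0 = 0
    for _ in range(max_iter):
        # Policy evaluation: solve v = r_sigma + beta * Q_sigma v
        R_sigma = R[np.arange(n), sigma]
        Q_sigma = Q[np.arange(n), sigma]
        v = np.linalg.solve(np.identity(n) - beta * Q_sigma, R_sigma)
        # Policy improvement: compute a v-greedy policy
        sigma_new = (R + beta * (Q @ v)).argmax(axis=1)
        if np.array_equal(sigma_new, sigma):
            break
        sigma = sigma_new
    return v, sigma

# The policy evaluation step above solves an $n \times n$ linear system directly;
# this is exactly the step that modified policy iteration, described next,
# replaces with a fixed number of iterations of $T_{\sigma}$.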
# ### Modified policy iteration
#
# `DiscreteDP.modified_policy_iteration(v_init, epsilon, iter_max, k)`
#
# 1. Choose any $v^0 \in \mathbb{R}^n$, and
#    specify $\varepsilon > 0$ and $k \geq 0$;
#    set $i = 0$.
# 2. [Policy improvement]
#    Compute a $v^i$-greedy policy $\sigma^{i+1}$;
#    let $\sigma^{i+1} = \sigma^i$ if possible (for $i \geq 1$).
# 3. Compute $u = T v^i$ ($= T_{\sigma^{i+1}} v^i$).
#    If $\mathrm{span}(u - v^i) < [(1 - \beta) / \beta] \varepsilon$, then go to step 5;
#    otherwise go to step 4.
# 4. [Partial policy evaluation]
#    Compute $v^{i+1} = (T_{\sigma^{i+1}})^k u$ ($= (T_{\sigma^{i+1}})^{k+1} v^i$).
#    Set $i = i + 1$ and go to step 2.
# 5. Return
#    $v = u + [\beta / (1 - \beta)] [(\min(u - v^i) + \max(u - v^i)) / 2] \mathbf{1}$
#    and $\sigma^{i+1}$.

# Given $\varepsilon > 0$,
# provided that $v^0$ is such that $T v^0 \geq v^0$,
# the modified policy iteration algorithm terminates in a finite number of iterations,
# and returns an $\varepsilon/2$-approximation of the optimal value function and
# an $\varepsilon$-optimal policy function
# (unless `iter_max` is reached).

# *Remarks*
#
# * Here we employ the termination criterion based on the *span semi-norm*,
#   where $\mathrm{span}(z) = \max(z) - \min(z)$ for $z \in \mathbb{R}^n$.
#   Since $\mathrm{span}(T v - v) \leq 2\lVert T v - v\rVert$,
#   this reaches $\varepsilon$-optimality faster than the norm-based criterion
#   employed in the value iteration above.
# * Except for the termination criterion,
#   modified policy iteration is equivalent to value iteration if $k = 0$ and
#   to policy iteration in the limit as $k \to \infty$.
# * Thus, if one would like to run value iteration with the span-based rule,
#   one can run modified policy iteration with $k = 0$.
# * In returning a value function, our implementation is slightly different from
#   that of Puterman (2005), Section 6.6.3, pp.201-202, which uses
#   $u + [\beta / (1 - \beta)] \min(u - v^i) \mathbf{1}$.
# * The condition for convergence, $T v^0 \geq v^0$, is satisfied,
#   for example, when $v^0 = v_{\sigma}$ for some policy $\sigma$,
#   or when $v^0(s) = \min_{(s', a)} r(s', a)$ for all $s$.
#   If `v_init` is not specified, it is set to the latter, i.e.,
#   $v^0(s) = \min_{(s', a)} r(s', a)$.

# ## Illustration

# We illustrate the algorithms above
# with the simple example from Puterman (2005), Section 3.1, pp.33-35.

# In[1]:

import numpy as np
import pandas as pd
from quantecon.markov import DiscreteDP

# In[2]:

n = 2  # Number of states
m = 2  # Number of actions

# Reward array
R = [[5, 10],
     [-1, -float('inf')]]

# Transition probability array
Q = [[(0.5, 0.5), (0, 1)],
     [(0, 1), (0.5, 0.5)]]  # Probabilities in Q[1, 1] are arbitrary

# Discount factor
beta = 0.95

ddp = DiscreteDP(R, Q, beta)

# Analytical solution:

# In[3]:

def sigma_star(beta):
    sigma = np.empty(2, dtype=int)
    sigma[1] = 0
    if beta > 10/11:
        sigma[0] = 0
    else:
        sigma[0] = 1
    return sigma

def v_star(beta):
    v = np.empty(2)
    v[1] = -1 / (1 - beta)
    if beta > 10/11:
        v[0] = (5 - 5.5*beta) / ((1 - 0.5*beta) * (1 - beta))
    else:
        v[0] = (10 - 11*beta) / (1 - beta)
    return v

# In[4]:

sigma_star(beta=beta)

# In[5]:

v_star(beta=beta)

# ### Value iteration

# Solve the problem by value iteration;
# see Example 6.3.1, p.164 in Puterman (2005).

# In[6]:

epsilon = 1e-2
v_init = [0, 0]
res_vi = ddp.solve(method='value_iteration', v_init=v_init, epsilon=epsilon)

# The number of iterations required to satisfy the termination criterion:

# In[7]:

res_vi.num_iter

# The returned value function:

# In[8]:

res_vi.v

# It is indeed an $\varepsilon/2$-approximation of $v^*$:

# In[9]:

np.abs(res_vi.v - v_star(beta=beta)).max() < epsilon/2

# The returned policy function:

# In[10]:

res_vi.sigma
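# As an additional check (a supplementary cell, not part of the original example),
# we can evaluate the returned policy exactly with `evaluate_policy`
# and confirm that it is indeed $\varepsilon$-optimal:

# In[ ]:

# Exact value of the policy returned by value iteration; by the
# epsilon-optimality property stated above, its distance from v*
# should be less than epsilon.
v_sigma = ddp.evaluate_policy(res_vi.sigma)
np.abs(v_sigma - v_star(beta=beta)).max() < epsilon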
# Value iteration converges very slowly.
# Let us replicate Table 6.3.1 on p.165:

# In[11]:

num_reps = 164
values = np.empty((num_reps, n))
diffs = np.empty(num_reps)
spans = np.empty(num_reps)

v = np.array([0, 0])
values[0] = v
diffs[0] = np.nan
spans[0] = np.nan

for i in range(1, num_reps):
    v_new = ddp.bellman_operator(v)
    values[i] = v_new
    diffs[i] = np.abs(v_new - v).max()
    spans[i] = (v_new - v).max() - (v_new - v).min()
    v = v_new

# In[12]:

df = pd.DataFrame()
df[0], df[1], df[2], df[3] = values[:, 0], values[:, 1], diffs, spans
df.columns = '$v^i(0)$', '$v^i(1)$', \
             '$\\lVert v^i - v^{i-1}\\rVert$', '$\\mathrm{span}(v^i - v^{i-1})$'
iter_nums = pd.Series(list(range(num_reps)), name='$i$')
df.index = iter_nums
display_nums = \
    list(range(10)) + [10*i for i in range(1, 16)] + [160+i for i in range(4)]
df.iloc[display_nums, [0, 1, 2]]

# On the other hand, the span decreases faster than the norm;
# the following replicates Table 6.6.1, p.205:

# In[13]:

df.iloc[list(range(1, 13)) + [10*i for i in range(2, 7)], [2, 3]]

# The span-based termination criterion is satisfied when $i = 11$:

# In[14]:

epsilon * (1-beta) / beta

# In[15]:

spans[11] < epsilon * (1-beta) / beta

# In fact, modified policy iteration with $k = 0$ terminates in $11$ iterations:

# In[16]:

epsilon = 1e-2
v_init = [0, 0]
k = 0
res_mpi_1 = ddp.solve(method='modified_policy_iteration',
                      v_init=v_init, epsilon=epsilon, k=k)

# In[17]:

res_mpi_1.num_iter

# In[18]:

res_mpi_1.v

# ### Policy iteration

# If $\{\sigma^i\}$ is the sequence of policies obtained by policy iteration
# with an initial policy $\sigma^0$,
# one can show that $T^i v_{\sigma^0} \leq v_{\sigma^i}$ ($\leq v^*$),
# so that the number of iterations required for policy iteration is,
# at least weakly, smaller than that for value iteration,
# and in many cases it is significantly smaller.

# In[19]:

v_init = [0, 0]
res_pi = ddp.solve(method='policy_iteration', v_init=v_init)

# In[20]:

res_pi.num_iter

# Policy iteration returns the exact optimal value function (up to rounding errors):

# In[21]:

res_pi.v

# In[22]:

np.abs(res_pi.v - v_star(beta=beta)).max()

# To look into the iterations:

# In[23]:

v = np.array([0, 0])
sigma = np.array([-1, -1])  # Dummy
sigma_new = ddp.compute_greedy(v)
i = 0

while True:
    print('Iterate {0}'.format(i))
    print(' value: {0}'.format(v))
    print(' policy: {0}'.format(sigma_new))
    if np.array_equal(sigma_new, sigma):
        break
    sigma[:] = sigma_new
    v = ddp.evaluate_policy(sigma)
    sigma_new = ddp.compute_greedy(v)
    i += 1

print('Terminated')

# See Example 6.4.1, pp.176-177.

# ### Modified policy iteration

# The policy evaluation step in policy iteration,
# which solves the linear equation $v = T_{\sigma} v$
# to obtain the policy value $v_{\sigma}$,
# can be expensive for problems with a large number of states.
# Modified policy iteration reduces the cost of this step
# by using an approximation of $v_{\sigma}$ obtained by iteration of $T_{\sigma}$.
# The tradeoff is that this approach only computes an $\varepsilon$-optimal policy,
# and, for small $\varepsilon$, takes a larger number of iterations than policy iteration
# (though much smaller than value iteration).
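# To see the idea of partial policy evaluation in isolation
# (a supplementary cell, not part of the original example),
# we apply $T_{\sigma}$ repeatedly to approximate the exact value of the optimal policy
# `res_pi.sigma` computed above;
# the approximation error decays at rate $\beta$ per application of $T_{\sigma}$.

# In[ ]:

# Partial policy evaluation: apply T_sigma to v = 0 a fixed number of
# times and compare with the exact policy value from evaluate_policy.
sigma = res_pi.sigma
v_exact = ddp.evaluate_policy(sigma)
v_approx = np.zeros(n)
ddp.operator_iteration(ddp.T_sigma(sigma), v=v_approx, max_iter=100)
np.abs(v_approx - v_exact).max()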
# In[24]:

epsilon = 1e-2
v_init = [0, 0]
k = 6
res_mpi = ddp.solve(method='modified_policy_iteration',
                    v_init=v_init, epsilon=epsilon, k=k)

# In[25]:

res_mpi.num_iter

# The returned value function:

# In[26]:

res_mpi.v

# It is indeed an $\varepsilon/2$-approximation of $v^*$:

# In[27]:

np.abs(res_mpi.v - v_star(beta=beta)).max() < epsilon/2

# To look into the iterations:

# In[28]:

epsilon = 1e-2
v = np.array([0, 0])
k = 6

i = 0
print('Iterate {0}'.format(i))
print(' v: {0}'.format(v))

sigma = np.empty(n, dtype=int)  # Store the policy function

while True:
    i += 1
    u = ddp.bellman_operator(v, sigma=sigma)
    diff = u - v
    span = diff.max() - diff.min()
    print('Iterate {0}'.format(i))
    print(' sigma: {0}'.format(sigma))
    print(' T_sigma(v): {0}'.format(u))
    print(' span: {0}'.format(span))
    if span < epsilon * (1-ddp.beta) / ddp.beta:
        v = u + ((diff.max() + diff.min()) / 2) * \
            (ddp.beta / (1 - ddp.beta))
        break
    ddp.operator_iteration(ddp.T_sigma(sigma), v=u, max_iter=k)
    v = u
    print(' T_sigma^k+1(v): {0}'.format(v))

print('Terminated')
print(' sigma: {0}'.format(sigma))
print(' v: {0}'.format(v))

# Compare this with the implementation based on the norm-based termination rule
# described in Example 6.5.1, pp.187-188.

# ## Reference

# * M.L. Puterman,
#   [*Markov Decision Processes: Discrete Stochastic Dynamic Programming*](http://onlinelibrary.wiley.com/book/10.1002/9780470316887),
#   Wiley-Interscience, 2005.
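# As a supplementary summary (an addition to the original notebook),
# we can compare the number of iterations that each of the three methods
# took on this example, using the results computed above:

# In[ ]:

# Iteration counts of the three solution methods used in this notebook.
for name, res in [('value iteration', res_vi),
                  ('policy iteration', res_pi),
                  ('modified policy iteration (k=6)', res_mpi)]:
    print('{0}: {1} iterations'.format(name, res.num_iter))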