#!/usr/bin/env python
# coding: utf-8
# # Foundations of Computational Economics #27
#
# by Fedor Iskhakov, ANU
#
#
# ## Dynamic programming in discrete world
#
#
#
#
# [https://youtu.be/kpNGDQnDpmU](https://youtu.be/kpNGDQnDpmU)
#
# Description: Backwards induction. Tiling problem. Deal or no deal game. Bellman optimality principle. Inventory dynamics model.
# ### What is dynamic programming?
#
# **“DP is recursive method for solving sequential decision problems”**
#
# 📖 Rust 2006, *New Palgrave Dictionary of Economics*
#
# In computer science the meaning of the term is broader:
# **DP is a general algorithm design technique for solving problems with
# overlapping sub-problems.**
#
# Generally allows solving in polynomial time $ O(n^k) $ problems that would
# otherwise take exponential time $ O(a^n) $
# #### Tiling with dominoes example
#
# Given a $ 3 \times n $ board, find **the number of ways** to
# fill it with $ 2 \times 1 $ dominoes.
# #### Examples of tiling
#
# These are three possible ways to fill up $ 3 \times 2 $ board
#
#
#
# This is one possible way to tile $ 3 \times 8 $ board
#
#
#
# The problem is to compute the number of possible tilings for any $ 3 \times n $ board.
# #### Breaking the big problem into subproblems
#
# Observe that at any possible stage of filling up a $ 3 \times n $ board we may have
# the following configurations
#
#
#
# And it is impossible to have the following configurations
#
#
# #### Defining the recursion
#
# The case of $ A_n $:
#
#
#
# The case of $ B_n $:
#
#
#
# The case of $ C_n $ is identical to $ B_n $, i.e. $ C_n = B_n $
# #### Recursive solution
#
# Therefore for any $ n $ we have
#
# $$
# \begin{eqnarray*}
# A_n &=& A_{n-2} + 2 B_{n-1} \\
# B_n &=& A_{n-1} + B_{n-2}
# \end{eqnarray*}
# $$
#
# The answer to the whole problem is given by $ A_n $
#
# 1. Inductive computation of the two sequences.
# 1. Can be improved by *memoization* (optimization technique based on caching previous calls to the function)
# In[1]:
def WaysTileDominoes(n):
'''Compute the number of ways to tile 3 x n area by 2x1 tiles'''
A, B = [0] * (n + 1),[0] * (n + 1)
A[0] = 1 # one way to tile 3x0
A[1] = 0 # no way to tile 3x1
B[0] = 0 # no way to tile 3x0 without a corner
B[1] = 1 # one way to tile 3x1 without a corner
for i in range(2, n+1): # loop over 2,3,..,n
A[i] = A[i-2] + 2 * B[i-1]
B[i] = A[i-1] + B[i-2]
return A[n]
# In[2]:
for n in range(1,20):
print('There are',WaysTileDominoes(n),'ways to tile the 3 by',n,'board')
# ### Deal or no deal example
#
# Consider a version of [Deal or no deal](https://en.wikipedia.org/wiki/Deal_or_No_Deal) TV show game
#
# - the player is presented with $ n $ boxes with prizes hidden inside
# - all prizes $ x_1,\dots,x_n $ are known to the player, but not where they are
# - at each round the player may choose a box at random to be removed from the game (and loose the opportunity to get the prize within)
# - otherwise, the player may choose to stop the game and walk away with the prize chosen randomly from remaining boxes
#
#
# What is the optimal strategy to maximize the (expected) reward?
# #### Need assumption about the reward evaluation
#
# - assume that the player is maximizing the expected reward
# - ok, but then for each set of remaining prizes $ (x,y,z) $ the expected reward is
#
#
# $$
# \frac{x+y+z}{3} \text{ (stopping the game)}
# $$
#
# $$
# \frac{1}{3}\cdot\frac{x+y}{2} + \frac{1}{3}\cdot\frac{x+z}{2} + \frac{1}{3}\cdot\frac{y+z}{2} \text{ (removing random box)}
# $$
#
# - at each point of the game the player will be indifferent between stopping or continuing the game
# - thus, optimal strategy is to stop at the first round and take random reward!
# #### Loss aversion utility
#
# - assume that the player has loss aversion utility of the form
#
#
# $$
# U(\text{reward},\text{reference}) = \text{reward} - \theta \cdot \mathbb{1}\{ \text{reward}< \text{reference}\}
# $$
#
# - if award is below the reference level, there is a cut in utility
# - assume further that the reference level is *updated* endogenously during the game:
# - at each round equal to the foregone option of stopping the game in the previous round
# #### How to solve for the optimal strategy in this game?
#
# 1. What is the maximum number of rounds in the game?
# 1. What does the optimal choice depend on in each round?
# 1. What is the *complete* strategy in the game?
# - for a given round let $ B $ denote the set of remaining $ n $ boxes/prizes
# - denote $ r $ the reference level in the utility function $ U(\cdot,r) $
#
#
# Let $ V(B,r) $ be the maximum expected reward, i.e. assuming optimal strategy is played from this round on
#
# $$
# V(B,r) = \max\Big[ \underbrace{\tfrac{1}{n} \sum_{b \in B} U(b,r)}_{\text{stop}} ;
# \underbrace{\sum_{b \in B} \tfrac{1}{n} V\big(B \backslash b, \tfrac{1}{n} \sum_{d \in B} U(d,r)\big) }_{\text{continue}}
# \Big]
# $$
#
# $$
# V(B,r) = \max\big[ R_r ; \sum_{b \in B} \tfrac{1}{n} V(B \backslash b, R_r) \big]
# $$
#
# - $ R_r = \tfrac{1}{n} \sum_{b \in B} U(b,r) $ is current round expected reward
# - $ B \backslash b $ is the next round set of $ n-1 $ boxes/prizes
# In[3]:
def expected_reward(boxes,ref=None,loss_aversion_param=.1,verbose=True):
'''Compute the expected reward from the game with given boxed prizes and the reference level'''
n = len(boxes) # number of boxed remaining
ref = sum(boxes)/n if ref is None else ref # default reference level
# reward if stopping
current = [b - loss_aversion_param*(b[1:
print(' reward if stop={:<6.3f} if continue={:<6.3f}'.format(current,next),end='') # rewards for two decisions
print(' >> {}!'.format('stop' if current >= next else 'continue')) # best decision
else:
print(' reward={:<6.3f}'.format(current)) # reward in case of last box left
return max(current,next) # reward from the optimal choice
# In[4]:
import math
boxes = [int(math.exp(b)) for b in range(3)] # uneven prizes
print('Initial prizes = ',boxes)
expected_reward(boxes,loss_aversion_param=0.1);
# #### Bellman’s Principle of Optimality
#
# “An optimal policy has a property that whatever the initial state and
# initial decision are, the remaining decisions must constitute an
# optimal policy with regard to the state resulting from the first
# decision.”
#
# 📖 Bellman, 1957 “Dynamic Programming”
# #### Breaking the problem into sequence of small problems
#
# - Thus, the sequential decision problem is broken into *initial decision*
# problem and the *future decisions* problem
# - The solution can be computed through **backward induction**,
# i.e. solving a sequential decision problem from the later periods
# - Embodiment of the recursive way of modeling sequential decisions is
# **Bellman equation**
# #### Bellman equation
#
# $$
# V(\text{state}) = \max_{\text{decisions}} \big[ U(\text{state},\text{decision}) + \beta \mathbb{E}\big\{ V(\text{next state}) \big| \text{state},\text{decision} \big\} \big]
# $$
#
# - $ V(\text{state}) $ is **value function** = maximum attainable (discounted) expected reward/utility/payoff
# - $ U(\text{state},\text{decision}) $ is per-period/flow/instantaneous reward/utility/payoff
# - (*next state*) is the *stochastic* next period state resulting from current state and decision
# - expectation $ \mathbb{E}\{\cdot\} $ is taken over the distribution of the next period state conditional on current state and decision
# - $ \beta $ is a discount factor to measure future rewards in terms of current ones
#
#
# The optimal choices are revealed along the solution of the Bellman equation as decisions which solve the maximization problem in the right hand side!
# #### Bellman equation in deterministic models
#
# In deterministic case, expectation is not necessary:
#
# $$
# V(\text{state}) = \max_{\text{decisions}} \big[ U(\text{state},\text{decision}) + \beta \cdot V(\text{next state}) \big]
# $$
# #### Bellman equation in models with finite horizon
#
# Additional condition at the final period $ t=T $, usually
#
# $$
# V(\text{state}) = \max_{\text{decisions}} \big[ U(\text{state},\text{decision}) \big] \text{ at terminal period } T
# $$
#
# In other words, as if $ V(\text{at } T +1 ) = \mathbb{0} $
# #### Power of dynamic programming
#
# DP is a the main tool in analyzing modern micro and macto economic models
#
# DP is powerful due to its **flexibility and breadth**
#
# DP provides a framework to study decision making over time and under uncertainty
# and can accommodate learning, strategic interactions between agents (game theory)
# and market interactions (equilibrium theory)
# #### Dynamic programming in economics
#
# Many important problems and economic models are analyzed and solved
# using dynamic programming:
#
# - Dynamic models of labor supply
# - Job search
# - Human capital accumulation
# - Health process, insurance and long term care
# - Consumption/savings choices
# - Durable consumption
# - Growth models
# - Heterogeneous agents models
# - Overlapping generation models
# #### Origin of the term Dynamic Programming
#
# 📖 Bellman’s autobiography “The Eye of the Hurricane”
#
# The 1950’s were not good years for mathematical research. We had a very interesting
# gentleman in Washington named Wilson. He was Secretary of Defence, and
# he actually had a pathological fear and hatred of the word “research”.
#
# I’m not using the term lightly; I’m using it precisely. His face would
# suffuse, he would turn red, and he would get violent if people used the
# term, research, in his presence. You can imagine how he felt, then,
# about the term, mathematical.
# Hence, I felt I had to do something to shield Wilson and the Air Force
# from the fact that I was really doing mathematics inside the RAND Corporation.
#
# What title, what name, could I choose?
#
# In the first place, I was interested in planning, in
# decision-making, in thinking. But planning, is not a good word for
# various reasons. I decided therefore to use the word, “programming”.
#
# I wanted to get across the idea that this was dynamic, this was
# multistage, this was time-varying.
# I thought, let’s kill two birds with one stone. Let’s take a word which has an absolutely precise
# meaning, namely dynamic, in the classical physical sense.
#
# It also has a very interesting property as an adjective, and that is it’s impossible
# to use the word, dynamic, in the pejorative sense.
#
# Thus, I thought dynamic programming was a good name. It was something not even a
# Congressman could object to. So I used it as an umbrella for my activities.
# ### Inventory dynamics problem
#
# Consider the following problem in discrete time and finite horizon $ t=0,\dots,T $
#
# The notation is:
#
# - $ x_t\ge 0 $ is inventory at period $ t $ measured in **discrete units**
# - $ d_t\ge 0 $ is *potentially stochastic* demand at period $ t $
# - $ q_t\ge 0 $ is the order of new inventory
# - $ p $ is the profit per one unit of (supplied) good
# - $ c $ is the fixed cost of ordering any amount of new inventory
# - $ r $ is the cost of storing one unit of good
# The sales in period $ t $ are given by $ s_t = \min\{x_t,d_t\} $.
#
# Inventory to be stored till next period is given by $ k_t = \max\{x_t-d_t,0\} + q_t = x_{t+1} $.
#
# The profit in period $ t $ is given by
#
# $$
# \begin{eqnarray}
# \pi_t & = & p \cdot \text{sales}_t - r \cdot x_{t+1} - c \cdot (\text{order made in period }t) \\
# & = & s_t p - k_t r - c \mathbb{1}\{q_t>0\}
# \end{eqnarray}
# $$
# Assuming all $ q_t \ge 0 $, let $ \sigma = \{q_t\}_{t=1,\dots,T} $ denote a feasible inventory policy.
#
# If $ d_t $ is stochastic the policy becomes a function of the period $ t $ inventory $ x_t $.
#
# The expected profit maximizing problem is given by
#
# $$
# {\max}_{\sigma} \mathbb{E}\Big[ \sum_{t=0}^{T} \beta^t \pi_t \Big],
# $$
#
# where $ \beta $ is discount factor.
# #### Bellman equation for the problem
#
# Decisions: $ q_t $, how much new inventory to order
#
# What is important for the inventory decision at time period $ t $?
# - instanteneous utility (profit) contains $ x_t $ and $ d_t $
# - timing: (beginning of period) - current inventory - demand - (choice) order - stored inventory - (end of period)
#
# So, both $ x_t $ and $ d_t $ are taken into account for the new order to be made, forming the state space.
# $$
# \begin{eqnarray}
# V(x_t,d_t) &=& \max_{q_t \ge 0} \Big\{ \pi_t + \beta \mathbb{E}\Big[ V\big(x_{t+1} , d_{t+1} \big) \Big| x_t,d_t,q_t \Big] \Big\} \\
# &=& \max_{q_t \ge 0} \Big\{ s_t p - k_t r - c \mathbb{1}\{q_t>0\}
# + \beta \mathbb{E}\Big[ V\big( k_t, d_{t+1} \big) \Big] \Big\}
# \end{eqnarray}
# $$
#
# $$
# \begin{eqnarray}
# s_t &=& \min\{x_t,d_t\} \\
# k_t &=& \max\{x_t-d_t,0\} + q_t
# \end{eqnarray}
# $$
# The expectation in the Bellman equation is taken over the distribution of the next period demand $ d_{t+1} $, which we assume is independent of any other variables and across time (idiosyncratic), thus the conditioning on $ (x_t,d_t,s_t) $ can be meaningfully dropped.
#
# Expectation can be written as an integral over the distribution of demand $ F(d) $, and since inventory is discrete it’s natural to assume demand is as well.
#
# The integral then transforms into a sum over the possible value of demand, weighted by their probabilities $ pr(d) $
#
# $$
# \begin{eqnarray}
# V(x_t,d_t)
# &=& \max_{q_t \ge 0} \Big\{ s_t p - k_t r - c \mathbb{1}\{q_t>0\}
# + \beta \int V\big( k_t, d \big) \partial F(d) \Big\} \\
# &=& \max_{q_t \ge 0} \Big\{ s_t p - k_t r - c \mathbb{1}\{q_t>0\}
# + \beta \sum_d V\big( k_t, d \big) pr(d) \Big\}
# \end{eqnarray}
# $$
# #### Today: deterministic case
#
# Let $ d $ be fixed and constant across time. How does the Bellman equation change?
#
# In the deterministic case with fixed $ d $, it can be simply dropped from the state space, and the Bellman equation can be simplified to
#
# $$
# \begin{multline}
# V(x_t) = \max_{q_t \ge 0} \big\{ p \min\{x_t,d\} - r \big[ \max\{x_t-d,0\} + q_t \big] \\ - c \mathbb{1}\{q_t>0\}
# + \beta V\big( \max\{x_t-d,0\} + q_t \big) \big\}
# \end{multline}
# $$
# In[5]:
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = [12, 8]
class inventory_model:
'''Small class to hold model fundamentals and its solution'''
def __init__(self,label='noname',
max_inventory=10, # upper bound on the state space
c = 3.2, # fixed cost of order
p = 2.5, # profit per unit of good
r = 0.5, # storage cost per unit of good
β = 0.95, # discount factor
demand = 4 # fixed demand
):
'''Create model with default parameters'''
self.label=label # label for the model instance
self.c, self.p, self.r, self.β = c, p, r, β
self.demand = demand
# created dependent attributes (it would be better to have them updated when underlying parameters change)
self.n = max_inventory+1 # number of inventory levels
self.upper = max_inventory # upper boundary on inventory
self.x = np.arange(self.n) # all possible values of inventory (state space)
def __repr__(self):
'''String representation of the model'''
return 'Inventory model labeled "{}"\nParamters (c,p,r,β) = ({},{},{},{})\nDemand={}\nUpper bound on inventory {}' \
.format (self.label,self.c,self.p,self.r,self.β,self.demand,self.upper)
def sales(self,x,d):
'''Sales in given period'''
return np.minimum(x,d)
def next_x(self,x,d,q):
'''Inventory to be stored, becomes next period state'''
return x - self.sales(x,d) + q
def profit(self,x,d,q):
'''Profit in given period'''
return self.p * self.sales(x,d) - self.r * self.next_x(x,d,q) - self.c * (q>0)
# In[6]:
model=inventory_model(label='test')
print(model)
q=np.zeros(model.n)
print('Current profits with zero orders\n',model.profit(model.x,model.demand,q))
# In[7]:
# illustration of broadcasting in the inventory model
q=model.x[:,np.newaxis] # column vector
print('Current inventory\n',model.x)
print('Current sales\n',model.sales(model.x,model.demand))
print('Current orders\n',q)
print('Next period inventory\n',model.next_x(model.x,model.demand,q))
print('Current profits\n',model.profit(model.x,model.demand,q))
# In[8]:
def bellman(m,v0):
'''Bellman equation for inventory model
Inputs: model object
next period value function
'''
# create the grid of choices (same as x), column-vector
q = m.x[:,np.newaxis]
# compute current period profit (relying on numpy broadcasting to get the matrix with choices in rows)
p = m.profit(m.x,m.demand,q)
# indexes for next period value with extrapolation using last value
i = np.minimum(m.next_x(m.x,m.demand,q),m.upper)
# compute the Bellman maximand
vm = p + m.β*v0[i]
# find max and argmax
v1 = np.amax(vm,axis=0) # maximum in every column
q1 = np.argmax(vm,axis=0) # arg-maximum in every column = order volume
return v1, q1
# In[9]:
v = np.zeros(model.n)
for i in range(3):
v,q = bellman(model,v)
print('Value =',v,'Policy =',q,sep='\n',end='\n\n')
# ### Backwards induction algorithm
#
# Solver for the finite horizon dynamic programming problems
#
# 1. Start at $ t=T $
# 1. Solve Bellman equation at $ t $, record optimal choice
# 1. Decrease $ t $ unless $ t=1 $, and return to previous step.
#
#
# As result, for all $ t=1,\dots,T $ have found the optimal choice (as a function of state)
# In[10]:
def solver_backwards_induction(m,T=10,verbose=False):
'''Backwards induction solver for the finite horizon case'''
# solution is time dependent
m.value = np.zeros((m.n,T))
m.policy = np.zeros((m.n,T))
# main DP loop (from T to 1)
for t in range(T,0,-1):
if verbose:
print('Time period %d\n'%t)
j = t-1 # index of value and policy functions for period t
if t==T:
# terminal period: ordering zero is optimal
m.value[:,j] = m.profit(m.x,m.demand,np.zeros(m.n))
m.policy[:,j] = np.zeros(m.n)
else:
# all other periods
m.value[:,j], m.policy[:,j] = bellman(m,m.value[:,j+1]) # next period to Bellman
if verbose:
print(m.value,'\n')
# return model with updated value and policy functions
return m
# In[11]:
model = inventory_model(label='illustration')
model=solver_backwards_induction(model,T=5,verbose=True)
print('Optimal policy:\n',model.policy)
# In[12]:
def plot_solution(model):
plt.step(model.x,model.value)
plt.legend([f'{i+1}' for i in range(model.value.shape[1])])
plt.title('Value function')
plt.show()
plt.step(model.x,model.policy)
plt.legend([f'{i+1}' for i in range(model.policy.shape[1])])
plt.title('Policy function (optimal order sizes)')
plt.show()
plot_solution(model)
# In[13]:
mod = inventory_model(label='production',max_inventory=50)
mod.demand = 15
mod.c = 5
mod.p = 2.5
mod.r = 1.4
mod.β = 0.975
mod = solver_backwards_induction(mod,T=15)
plot_solution(mod)
# #### Next steps
#
# - Stochastic demand
# - Infinite horizon
#
#
# More appropriate setup for the dynamic programming solution, so today only a simple special case
# ### Further learning resources
#
# - 📖 Adda and Russell Cooper “Dynamic Economics. Quantitative Methods and Applications.” *Chapters: 2, 3.3*
# - Bellman equation [https://en.wikipedia.org/wiki/Bellman_equation](https://en.wikipedia.org/wiki/Bellman_equation)
# - Computer science view on DP [https://www.techiedelight.com/introduction-dynamic-programming](https://www.techiedelight.com/introduction-dynamic-programming)
# - Popular optimal stopping [https://www.americanscientist.org/article/knowing-when-to-stop](https://www.americanscientist.org/article/knowing-when-to-stop)
]