import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from ipywidgets import interact, HTML, FloatSlider
%matplotlib inline
# The agent has four actions -- Left, Right, Up, Down -- except at the
# edges and corners, where moves that would leave the grid are unavailable.
def action_LRUD(N):
    # Transition matrix of a uniform random policy on an N x N grid:
    # each valid move gets equal probability, so corners have two moves
    # (p = 1/2 each), edge cells three (p = 1/3), interior cells four (p = 1/4).
    Action = np.zeros((N * N, N * N))

    # Top-left corner: only Right and Down
    i, j = 0, 0
    state = i * N + j
    Action[state, state + 1] = 1 / 2.          # right
    Action[state, (i + 1) * N + j] = 1 / 2.    # down

    # Top-right corner: only Left and Down
    i, j = 0, N - 1
    state = i * N + j
    Action[state, state - 1] = 1 / 2.          # left
    Action[state, (i + 1) * N + j] = 1 / 2.    # down

    # Lower-left corner: only Right and Up
    i, j = N - 1, 0
    state = i * N + j
    Action[state, state + 1] = 1 / 2.          # right
    Action[state, (i - 1) * N + j] = 1 / 2.    # up

    # Lower-right corner: only Left and Up
    i, j = N - 1, N - 1
    state = i * N + j
    Action[state, state - 1] = 1 / 2.          # left
    Action[state, (i - 1) * N + j] = 1 / 2.    # up

    # Top row (no Up)
    i = 0
    for j in range(1, N - 1):
        state = i * N + j
        Action[state, state + 1] = 1 / 3.          # right
        Action[state, state - 1] = 1 / 3.          # left
        Action[state, (i + 1) * N + j] = 1 / 3.    # down

    # Bottom row (no Down)
    i = N - 1
    for j in range(1, N - 1):
        state = i * N + j
        Action[state, state + 1] = 1 / 3.          # right
        Action[state, state - 1] = 1 / 3.          # left
        Action[state, (i - 1) * N + j] = 1 / 3.    # up

    # Left column (no Left)
    j = 0
    for i in range(1, N - 1):
        state = i * N + j
        Action[state, state + 1] = 1 / 3.          # right
        Action[state, (i - 1) * N + j] = 1 / 3.    # up
        Action[state, (i + 1) * N + j] = 1 / 3.    # down

    # Right column (no Right)
    j = N - 1
    for i in range(1, N - 1):
        state = i * N + j
        Action[state, state - 1] = 1 / 3.          # left
        Action[state, (i - 1) * N + j] = 1 / 3.    # up
        Action[state, (i + 1) * N + j] = 1 / 3.    # down

    # Interior cells: all four actions are valid
    for i in range(1, N - 1):
        for j in range(1, N - 1):
            state = i * N + j
            Action[state, state + 1] = 1 / 4.          # right
            Action[state, state - 1] = 1 / 4.          # left
            Action[state, (i - 1) * N + j] = 1 / 4.    # up
            Action[state, (i + 1) * N + j] = 1 / 4.    # down

    return Action
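Each row of `Action` is meant to be a probability distribution over next states; a quick sanity check (a sketch, using a small grid):
A = action_LRUD(4)
assert np.allclose(A.sum(axis=1), 1.0)   # every row sums to 1
assert np.count_nonzero(A[0]) == 2       # a corner state has two valid moves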
N = 6
Action = action_LRUD(N)
# Reward: -1 everywhere ...
state_reward = -1 * np.ones((N * N))
# ... except the terminating state, which gets a positive reward
terminating_state = np.random.randint(0, N) * N + np.random.randint(0, N)
state_reward[terminating_state] = +20
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
ax.xaxis.set_ticks([i for i in range(1, N + 1)])
ax.yaxis.set_ticks([i for i in range(1, N + 1)])
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.grid(True, linestyle='-', linewidth=.5)
vals = np.around(state_reward.reshape(N, N)).astype(int)
for i in range(N):
    for j in range(N):
        ax.annotate(vals[i, j], (i + .1, j + .1), size=10, va="center")
# Draw an arrow for every valid move out of each state
for i in range(N):
    for j in range(N):
        state = i * N + j
        valid_actions = Action[state] > 0
        next_states = np.where(valid_actions)[0]
        for d in next_states:
            i1 = d // N    # integer division: row index of the next state
            j1 = d % N
            plt.arrow(i + .5, j + .5, .3 * (i1 - i), .3 * (j1 - j),
                      head_width=.15, head_length=.13, fc='r', ec='r')
plt.tight_layout()
N = 10
Action = action_LRUD(N)
# Reward: -1 everywhere except the terminating state
state_reward = -1 * np.ones((N * N))
terminating_state = np.random.randint(0, N) * N + np.random.randint(0, N)
state_reward[terminating_state] = +20
Gamma = .5
# "Value iteration" sweep (strictly: iterative policy evaluation of the
# uniform random policy, V(s) <- sum_s' P(s'|s) [R(s') + Gamma * V(s')])
state_value = np.zeros((N * N))
delta = 0
deltas = []
for i in range(40):
    temp = state_value.copy()
    # Update the value of every state in one sweep
    for j in range(Action.shape[0]):
        state_value[j] = np.sum(np.multiply(Action[j], state_reward + Gamma * state_value))
    diff = np.sum(np.abs(temp - state_value))
    delta = np.maximum(delta, diff)
    deltas.append(diff)
plt.plot(deltas);
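Because the update is linear in the values, its fixed point can also be computed in closed form; a cross-check sketch, reusing `Action`, `state_reward`, and `Gamma` from above:
# Fixed point of V = P (R + Gamma V)  =>  V* = (I - Gamma P)^(-1) P R
V_exact = np.linalg.solve(np.eye(N * N) - Gamma * Action, Action.dot(state_reward))
print(np.max(np.abs(V_exact - state_value)))  # ~0 once the sweeps have converged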
fig = plt.figure(figsize=(7, 7))
plt.imshow(state_value.reshape(N, N), cmap=plt.cm.gray_r)
i = terminating_state // N
j = terminating_state % N
plt.plot(j, i, 'og', markersize=9)
plt.axis('off')
# Draw the greedy action(s) out of every state
for i in range(N):
    for j in range(N):
        state = i * N + j
        valid_actions = Action[state] > 0
        action_reward = np.around(np.multiply(Action[state], state_value), decimals=3)
        if not valid_actions.any():
            continue
        max_val = np.max(action_reward[valid_actions])
        next_greedy_state = np.where(action_reward == max_val)[0]
        for d in next_greedy_state:
            i1 = d // N
            j1 = d % N
            plt.arrow(j, i, .3 * (j1 - j), .3 * (i1 - i),
                      head_width=.15, head_length=.13, fc='r', ec='r')
plt.grid()
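Strictly speaking, the sweep above is iterative policy evaluation of the uniform random policy, and the greedy arrows then act as one step of policy improvement. Classical value iteration maximizes over actions instead; a minimal sketch of that variant under the same `Action`, `state_reward`, and `Gamma` (moves are deterministic here, so the max over actions is a max over valid next states):
V = np.zeros(N * N)
for _ in range(100):
    for s in range(N * N):
        nxt = np.where(Action[s] > 0)[0]               # valid next states
        if nxt.size:
            V[s] = np.max(state_reward[nxt] + Gamma * V[nxt])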
def value_iteration(Itern=50, Gamma=.4):
    # Run Itern evaluation sweeps, then plot the convergence curve and
    # the resulting values with the greedy action(s) out of each state
    state_value = np.zeros((N * N))
    delta = 0
    deltas = []
    for i in range(Itern):
        temp = state_value.copy()
        for j in range(Action.shape[0]):
            state_value[j] = np.sum(np.multiply(Action[j], state_reward + Gamma * state_value))
        diff = np.sum(np.abs(temp - state_value))
        delta = np.maximum(delta, diff)
        deltas.append(diff)
    state_value = np.around(state_value, decimals=1)
    fig = plt.figure(figsize=(14, 7))
    plt.subplot(1, 2, 1)
    plt.plot(deltas)
    plt.subplot(1, 2, 2)
    plt.imshow(state_value.reshape(N, N), cmap=plt.cm.gray_r)
    i = terminating_state // N
    j = terminating_state % N
    plt.plot(j, i, 'og', markersize=9)
    plt.axis('off')
    for i in range(N):
        for j in range(N):
            state = i * N + j
            valid_actions = Action[state] > 0
            action_reward = np.around(np.multiply(Action[state], state_value), decimals=2)
            if not valid_actions.any():
                continue
            max_val = np.max(action_reward[valid_actions])
            next_greedy_state = np.where(action_reward == max_val)[0]
            for d in next_greedy_state:
                i1 = d // N
                j1 = d % N
                plt.arrow(j, i, .3 * (j1 - j), .3 * (i1 - i),
                          head_width=.15, head_length=.13, fc='r', ec='r')
    plt.grid()
    plt.tight_layout()
interact(value_iteration,Itern=(20,100,10),Gamma=(.0,1,.05));
N = 10
Action = action_LRUD(N)
# Reward: -1 everywhere except the terminating state
state_reward = -1 * np.ones((N * N))
terminating_state = np.random.randint(0, N) * N + np.random.randint(0, N)
state_reward[terminating_state] = +20
# Add two randomly placed blocking states: remove all transitions into
# and out of them, and give them a strongly negative reward
for _ in range(2):
    blocking_state = np.random.randint(1, N - 1) * N + np.random.randint(1, N - 1)
    Action[blocking_state] = 0
    Action[:, blocking_state] = 0
    state_reward[blocking_state] = -20
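Note that zeroing the columns leaves the rows of states next to a blocked cell summing to less than one; if you want the remaining moves to stay a proper distribution, renormalize the rows (an optional sketch):
row_sums = Action.sum(axis=1, keepdims=True)
Action = np.divide(Action, row_sums, out=np.zeros_like(Action), where=row_sums > 0)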
interact(value_iteration,Itern=(20,160,10),Gamma=(.0,1,.05));
N = 10
Action = action_LRUD(N)
# Random reward between 0 and 9 in every state ...
state_reward = np.random.randint(0, 10, size=N * N)
# ... except the terminating state, which stands out at +20
terminating_state = np.random.randint(0, N) * N + np.random.randint(0, N)
state_reward[terminating_state] = +20
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
ax.xaxis.set_ticks([i for i in range(1, N + 1)])
ax.yaxis.set_ticks([i for i in range(1, N + 1)])
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.grid(True, linestyle='-', linewidth=.5)
vals = np.around(state_reward.reshape(N, N)).astype(int)
for i in range(N):
    for j in range(N):
        ax.annotate(vals[i, j], (i + .1, j + .1), size=10, va="center")
def value_iteration1(Itern=100, Gamma=.5):
    # Same evaluation sweeps as above, but the values are annotated on the
    # grid instead of drawn as an image
    state_value = np.zeros((N * N))
    delta = 0
    deltas = []
    for i in range(Itern):
        temp = state_value.copy()
        for j in range(Action.shape[0]):
            state_value[j] = np.sum(np.multiply(Action[j], state_reward + Gamma * state_value))
        diff = np.sum(np.abs(temp - state_value))
        delta = np.maximum(delta, diff)
        deltas.append(diff)
    state_value = np.around(state_value, decimals=2)
    fig = plt.figure(figsize=(14, 7))
    plt.subplot(1, 2, 1)
    plt.plot(deltas)
    ax = fig.add_subplot(1, 2, 2)
    ax.xaxis.set_ticks([i for i in range(1, N + 1)])
    ax.yaxis.set_ticks([i for i in range(1, N + 1)])
    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.grid(True, linestyle='-', linewidth=.5)
    vals = np.around(state_value.reshape(N, N)).astype(int)
    for i in range(N):
        for j in range(N):
            ax.annotate(vals[i, j], (i + .1, j + .1), size=10, va="center")
    for i in range(N):
        for j in range(N):
            state = i * N + j
            valid_actions = Action[state] > 0
            action_reward = np.around(np.multiply(Action[state], state_value), decimals=2)
            if not valid_actions.any():
                continue
            max_val = np.max(action_reward[valid_actions])
            next_greedy_state = np.where(action_reward == max_val)[0]
            for d in next_greedy_state:
                i1 = d // N
                j1 = d % N
                plt.arrow(i + .5, j + .5, .3 * (i1 - i), .3 * (j1 - j),
                          head_width=.15, head_length=.13, fc='r', ec='r')
    plt.tight_layout()
interact(value_iteration1,Itern=(20,160,10),Gamma=(.0,1,.05));
# Graph of connectivity
plt.imshow(Action);
## Load the environment
import gym
env = gym.make('FrozenLake-v0')
env.action_space.sample()
0
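FrozenLake-v0 is a 4x4 grid, so the observation space is Discrete(16) and the action space Discrete(4), with actions 0=left, 1=down, 2=right, 3=up:
print(env.observation_space.n, env.action_space.n)  # 16 states, 4 actions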
# Initialize the Q-table with zeros
Q = np.zeros([env.observation_space.n, env.action_space.n])
# Set the learning parameters
lr = .85    # learning rate
y = .99     # discount factor
num_episodes = 2000
# Create a list to record the total reward per episode
rList = []
for i in range(num_episodes):
    # Reset the environment and get the first observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-table learning algorithm
    while j < 99:
        j += 1
        # Choose an action greedily, with noise that decays over episodes
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # Get the new state and reward from the environment
        s1, r, d, _ = env.step(a)
        # Update the Q-table with the new knowledge
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)
# One greedy step from the start state using the learned table
s = env.reset()
a = np.argmax(Q[s, :])
env.step(a)
(0, 0.0, False, {'prob': 0.3333333333333333})
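To evaluate the learned table over a whole episode rather than a single step, a minimal greedy rollout (a sketch; the lake is slippery, so even the greedy policy fails on some episodes):
s = env.reset()
d = False
while not d:
    s, r, d, _ = env.step(np.argmax(Q[s, :]))
print(r)  # 1.0 if the goal was reached, 0.0 otherwise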
print "Score over time: " + str(sum(rList)/num_episodes)
Score over time: 0.3775
print "Final Q-Table Values"
print Q
Final Q-Table Values
[[  2.41812747e-01   1.34668144e-02   1.37759130e-02   1.34745022e-02]
 [  0.00000000e+00   1.54653562e-02   7.79397923e-05   3.28061161e-01]
 [  2.95354171e-03   1.44168576e-02   3.38725506e-03   4.90887652e-01]
 [  1.51910228e-03   1.80457987e-02   2.23516011e-03   3.57746406e-01]
 [  2.50271470e-01   6.24764384e-04   0.00000000e+00   3.11052109e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  6.24345458e-01   5.55418788e-09   1.72631494e-04   1.42910829e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.64678842e-03   5.16343436e-05   2.41407774e-03   5.49872600e-01]
 [  0.00000000e+00   8.43591681e-01   0.00000000e+00   0.00000000e+00]
 [  6.12664778e-01   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   8.25771751e-01   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   9.35583578e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]]
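The greedy policy is just the row-wise argmax of the table; reshaped to the 4x4 lake it reads as a map of preferred moves (a sketch):
policy = np.argmax(Q, axis=1).reshape(4, 4)
print(policy)  # 0=left, 1=down, 2=right, 3=up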
from IPython.display import YouTubeVideo
YouTubeVideo('V1eYniJ0Rnk',width=700, height=600)
Compared to classical computational modeling, here we learn rather than specify: we don't hardcode the logic and the strategies, we learn them. Some application areas:
- Banks and economic policies
- Supply chain and inventory management (VMI)
- Dynamic resource allocation
- Simulation-based optimization