In [1]:
import numpy as np
import tensorflow as tf

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
import seaborn as sns

%matplotlib inline

Iterated Prisoner's Dilemma environment

In [2]:
class IPD():
    """
    A two-agent vectorized environment.
    Possible actions for each agent are (C)ooperate and (D)efect.
    """
    NUM_AGENTS = 2
    NUM_ACTIONS = 2
    NUM_STATES = 5  # one-hot over (CC, CD, DC, DD, initial state)

    def __init__(self, max_steps, batch_size=1):
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.payout_mat = np.array([[-1., 0.], [-3., -2.]])
        self.available_actions = [
            np.ones((batch_size, self.NUM_ACTIONS), dtype=int)
            for _ in range(self.NUM_AGENTS)
        ]

        self.step_count = None

    def reset(self):
        self.step_count = 0
        init_state = np.zeros((self.batch_size, self.NUM_STATES))
        init_state[:, -1] = 1
        observations = [init_state, init_state]
        info = [{'available_actions': aa} for aa in self.available_actions]
        return observations, info
    
    # advance all batch environments by a single step
    def step(self, action):
        ac0, ac1 = action  # action = [ac0, ac1], each ac_i = [a_1, ..., a_{batch_size}]

        self.step_count += 1

        rewards = []
        state0 = np.zeros((self.batch_size, self.NUM_STATES))
        state1 = np.zeros((self.batch_size, self.NUM_STATES))
        for i, (a0, a1) in enumerate(zip(ac0, ac1)):
            rewards.append([self.payout_mat[a1][a0], self.payout_mat[a0][a1]])
            state0[i, a0 * 2 + a1] = 1
            state1[i, a1 * 2 + a0] = 1
        rewards = list(map(np.asarray, zip(*rewards))) # [rewards1, rewards2]
        observations = [state0, state1]

        done = (self.step_count == self.max_steps)
        info = [{'available_actions': aa} for aa in self.available_actions]

        return observations, rewards, done, info
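
# --- Illustrative usage sketch (added for exposition; not part of the experiment, the _demo_* names are new) ---
# With batch_size=1, one step where agent 0 defects and agent 1 cooperates should
# give rewards (0, -3) under payout_mat above, and agent 0 should observe state DC.
_demo_env = IPD(max_steps=1, batch_size=1)
_demo_obs, _ = _demo_env.reset()
assert _demo_obs[0][0, -1] == 1                  # both agents start in the dummy initial state
_obs, _rews, _done, _ = _demo_env.step([np.array([1]), np.array([0])])
assert _rews[0][0] == 0. and _rews[1][0] == -3.  # the defector gets 0, the cooperator -3
assert _obs[0][0, 2] == 1 and _done              # index 2 = DC in agent 0's encoding; the 1-step episode is done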

Helper functions

In [3]:
tf.reset_default_graph()

def get_mat(game):
    if game == 'p':  # iterated prisoner's dilemma payoffs
        payout_mat_1 = np.array([[-1, 0], [-3, -2]])
        payout_mat_2 = payout_mat_1.T
    else:            # matching pennies (zero-sum)
        payout_mat_1 = np.array([[1, -1], [-1, 1]])
        payout_mat_2 = -1 * payout_mat_1
    gamma = 0.96
    return payout_mat_1, payout_mat_2, gamma

gamma = tf.placeholder(shape=[], dtype=tf.float32)  # discount factor (scalar)

def build_policy(scope, env, theta, max_steps, reuse=None):
    pi = {}
    with tf.variable_scope(scope, reuse=reuse):
        # Placeholders
        pi['acs_ph'] = tf.placeholder(shape=[None, None], dtype=tf.int32)                   # [steps, batch]
        pi['obs_ph'] = tf.placeholder(shape=[None, None, env.NUM_STATES], dtype=tf.float32) # [steps, batch, 5]
        pi['rews_ph'] = tf.placeholder(shape=[None, None], dtype=tf.float32)                # [steps, batch]
        pi['target'] = tf.placeholder(shape=[None, None], dtype=tf.float32)                 # [steps, batch]
        pi['discount_vec'] = tf.placeholder(shape=[None, 1, 1], dtype=tf.float32)           # [steps, 1, 1]
        
        
        # Parameters
        pi['theta'] = theta                                                       # policy parameters, shape [NUM_STATES]
        pi['theta_val'] = tf.Variable(tf.random_normal([env.NUM_STATES]))         # value-function weights, shape [NUM_STATES]
        pi['theta_val_target'] = tf.Variable(tf.random_normal([env.NUM_STATES]))  # target-network copy of the value weights

        # Log probs and predictions
        logits = tf.reduce_sum(
            tf.multiply(pi['obs_ph'], tf.reshape(pi['theta'], shape=(1, 1, -1))),
            axis=-1, keepdims=True)
        logits = tf.concat([logits, tf.zeros_like(logits)], -1)
        
        # [step, batch, 1]
        pi['value'] = tf.reduce_sum(
            tf.multiply(pi['obs_ph'], tf.reshape(pi['theta_val'], shape=(1, 1, -1))),
            axis=-1, keepdims=True)
        
        # (step, batch, 1)
        pi['value_target'] = tf.reduce_sum(
            tf.multiply(pi['obs_ph'], tf.reshape(pi['theta_val_target'], shape=(1, 1, -1))),
            axis=-1, keepdims=True)
        
        
        pi['log_pi'] = tf.nn.log_softmax(logits)  # [steps, batch, NUM_ACTIONS]
        pi['acs_onehot'] = tf.one_hot(
            pi['acs_ph'], env.NUM_ACTIONS, dtype=tf.float32)
        pi['log_pi_acs'] = tf.reduce_sum(
            tf.multiply(pi['log_pi'], pi['acs_onehot']),
            axis=-1)
        pi['pi'] = tf.nn.softmax(logits)
        pi['pi_acs'] = tf.reduce_sum(
            tf.multiply(pi['pi'], pi['acs_onehot']),
            axis=-1)

        # cumulative sum of log-probs over time: sum_{t'<=t} log pi(a_{t'}|s_{t'})
        ac_logp0_cumsum = [tf.reshape(pi['log_pi_acs'][0], [1, -1])]
        for i in range(1, max_steps):
            ac_logp0_cumsum.append(tf.add(ac_logp0_cumsum[-1], pi['log_pi_acs'][i]))
        pi['log_pi_acs_cumsum'] = tf.concat(ac_logp0_cumsum, 0)
        
        pi['predict'] = tf.squeeze(tf.multinomial(
            tf.reshape(pi['log_pi'], shape=(-1, env.NUM_ACTIONS)), 1))
        
        pi['loss_value'] = tf.reduce_mean(
            tf.reduce_sum( tf.pow(tf.squeeze(pi['value'])  - pi['target'], 2), axis=0 ) # sum over all steps
            ) # average over all batches
    return pi


# DiCE operator
def magic_box(x):
    return tf.exp(x - tf.stop_gradient(x))
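
# --- Quick check of the MagicBox property (illustrative only; not used below, the _mb_* names are new) ---
# magic_box(x) always evaluates to exp(0) = 1 in the forward pass, while its
# gradient w.r.t. the parameters equals the gradient of x itself, which is what
# lets the DiCE objective be differentiated any number of times.
_mb_x = tf.constant(2.0)
_mb_out = magic_box(3.0 * _mb_x)            # forward value: 1.0
_mb_grad = tf.gradients(_mb_out, _mb_x)[0]  # evaluates to 3.0 = d(3x)/dx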


def get_naive_hessian_pi_squared(policies):
    # naive likelihood-ratio surrogate (unused below): pi / stop_gradient(pi) evaluates to 1 but carries the score-function gradient
    losses = [tf.reduce_mean(tf.reduce_sum(tf.multiply(pi['pi_acs'] / tf.stop_gradient(pi['pi_acs']), pi['rews_ph']), axis=0))
              for pi in policies]
    return losses


def gradient(x, theta):
    return tf.gradients(x, theta)[0]


# The DiCE objective
# pi['rews_ph']: [step, batch]
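# Concretely, as implemented below: each agent's objective is
#   sum_t r_t * MagicBox(sum_{t'<=t} [log pi_1(a_{t'}|s_{t'}) + log pi_2(a_{t'}|s_{t'})])
# so that higher-order gradients flow through both policies, plus a baseline term
#   sum_t stop_gradient(gamma^t * V(s_t)) * (1 - MagicBox(log pi_1(a_t|s_t) + log pi_2(a_t|s_t)))
# which is identically zero in the forward pass and only reduces gradient variance.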
def get_dice_objective(scope, policies):
    dependencies = magic_box( sum( pi['log_pi_acs_cumsum'] for pi in policies) ) 
    baseline = 1 - magic_box( sum( pi['log_pi_acs'] for pi in policies) ) # (step, batch)
    losses = [
        tf.reduce_mean( 
            tf.reduce_sum( tf.multiply(pi['rews_ph'], dependencies), axis=0 ) # sum over all steps
        ) + # average over all batches
        tf.reduce_mean(
            tf.reduce_sum(
                tf.multiply(
                    tf.squeeze( tf.stop_gradient( tf.multiply(pi['value'], pi['discount_vec']) ) ), # [step, batch, 1] -> [step, batch]
                    baseline # [step, batch]
                ),
            axis=0)
        )
        for pi in policies
    ]
    return losses
  
def get_baseline(scope, policies):
    baseline = (1 - magic_box( sum( pi['log_pi_acs'] for pi in policies) ))[1:] # (step, batch)
    dependencies = magic_box( sum( pi['log_pi_acs_cumsum'] for pi in policies) )[:-1]
    product = tf.multiply(baseline, 1 - dependencies)
    losses = [
        -tf.reduce_mean(
            tf.reduce_sum(
                tf.multiply(product, tf.squeeze( tf.stop_gradient( tf.multiply(pi['value'], pi['discount_vec']) )[1:] ) ),
            axis=0)
        )
        for pi in policies
    ]
    return losses

def get_actor(pi, sess):
    def actor(ob):
        ac = sess.run(pi['predict'], feed_dict={pi['obs_ph']: [ob]})
        return ac
    return actor

# sample grad log pi
def get_gradient(pi, sess, ob, ac):
    objective = tf.reshape(pi['log_pi_acs'], [-1])
    examples = tf.split(objective, num_or_size_splits=len(ob))
    thetas = pi['theta']
    grads = [tf.gradients(obj, thetas) for obj in examples]
    res = sess.run(grads, feed_dict={pi['obs_ph']: [ob], pi['acs_ph']: [ac]})
    return res

# sample grad log pi analytically
def get_gradient_analytic(pi, sess, ob, ac):
    left = [o * (1 - mul) for o, mul in zip(ob, ac)]
    probs = sess.run(pi['pi'], feed_dict={pi['obs_ph']: [ob]})
    prob0s = probs[:, :, 0] # the probability of cooperating
    prob0s = prob0s[0]
    right = [o * prob for o, prob in zip(ob, prob0s)]
    res = [l - r for l, r in zip(left, right)]
    return res


def Vs(theta_1_all_5_dim, theta_2_all_5_dim, R_1, R_2):
    theta_1 = tf.slice(theta_1_all_5_dim, [0,0], [4,1] )
    theta_2 = tf.slice(theta_2_all_5_dim, [0,0], [4,1] )

    theta_1_0 = tf.slice(theta_1_all_5_dim, [4,0], [1,1] )
    theta_2_0 = tf.slice(theta_2_all_5_dim, [4,0], [1,1] )

    p_1 = tf.nn.sigmoid(theta_1)
    p_2 = tf.nn.sigmoid(theta_2)

    p_1_all = tf.nn.sigmoid(theta_1_all_5_dim)
    p_2_all = tf.nn.sigmoid(theta_2_all_5_dim)

    p_1_0 = tf.nn.sigmoid(theta_1_0)
    p_2_0 = tf.nn.sigmoid(theta_2_0)

    p_1_0_v = tf.concat([p_1_0, (1-p_1_0)], 0)
    p_2_0_v = tf.concat([p_2_0, (1-p_2_0)], 0)

    s_0 = tf.reshape(tf.matmul(p_1_0_v, tf.transpose(p_2_0_v)),[-1,1])

    # transition matrix over joint actions (CC, CD, DC, DD)
    P = tf.concat([tf.multiply(p_1, p_2), tf.multiply(p_1, 1 - p_2), tf.multiply(1 - p_1, p_2), tf.multiply(1 - p_1, 1 - p_2)], 1)
    
    I_m_P = tf.diag([1.0,1.0,1.0,1.0]) - P*gamma
    v_1 = tf.matmul(tf.matmul(tf.matrix_inverse( I_m_P),R_1), s_0,transpose_a=True)
    v_2 = tf.matmul(tf.matmul(tf.matrix_inverse( I_m_P),R_2), s_0,transpose_a=True)
    return [v_1, v_2, p_1_all, p_2_all]
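
# --- Illustrative NumPy check of the closed form above (a sketch; not used in training, the _*_cc names are new) ---
# Vs computes v_i = s_0^T (I - gamma * P)^{-1} R_i, with P the 4x4 transition matrix
# over joint actions (CC, CD, DC, DD). If both agents always cooperate, the IPD reward
# is -1 per step, so the exact discounted value is -1 / (1 - 0.96) = -25.
_P_cc = np.array([[1., 0., 0., 0.]] * 4)      # every state transitions to CC
_s0_cc = np.array([[1.], [0.], [0.], [0.]])   # the first joint action is CC
_r_cc = np.array([[-1.], [0.], [-3.], [-2.]])
_v_cc = _s0_cc.T.dot(np.linalg.inv(np.eye(4) - 0.96 * _P_cc)).dot(_r_cc)
assert np.isclose(_v_cc[0, 0], -25.0)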

# Placeholders for the flattened payout matrices (per-state reward vectors used by Vs)
R_1 = tf.placeholder(shape=[4,1],dtype=tf.float32)
R_2 = tf.placeholder(shape=[4,1],dtype=tf.float32)

Training initialisation

In [4]:
# Experiment parameters
n_agents = 2
gamma_val = .96

batch_size = 128
batch_size_large = 1000
max_steps = 100
train_episodes = 400


env = IPD(max_steps=max_steps, batch_size=batch_size)
env_large = IPD(max_steps=max_steps, batch_size=batch_size_large)

#theta_all = tf.Variable(tf.random_normal([2 * env.NUM_STATES]))
theta_all = tf.Variable(tf.random_normal([2 * env.NUM_STATES], mean=0.0, stddev=5))
theta_1 = theta_all[:5]
theta_2 = theta_all[5:]

thetas = [theta_1, theta_2]

thetas_all = [theta_all, theta_all]

# build policies using randomly generated thetas
policies = [build_policy("pi_%d" % (i + 1), env, theta, max_steps) for i, theta in enumerate(thetas)]

# DiCE objectives for two players
v_1_p, v_2_p = get_dice_objective("delta", policies)
b_1, b_2 = get_baseline("delta", policies)

[v_1, v_2, p_1_all, p_2_all] = Vs(tf.reshape(theta_1, [-1,1]), tf.reshape(theta_2, [-1,1]), R_1, R_2)

# Value-function training: fit theta_val to the bootstrapped return targets
optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.005)
loss = sum( [ pi['loss_value'] for pi in policies ])
train = optimizer.minimize(loss)

assign_target = [ pi['theta_val_target'].assign( pi['theta_val']) for pi in policies ]

# gradients and Hessians w.r.t. the full joint parameter vector theta_all
grad_p = [ gradient(v, theta) for v, theta in zip([v_1_p, v_2_p], thetas_all)] # prediction
grad_exact = [ gradient(v, theta) for v, theta in zip([v_1, v_2], thetas_all)] # exact
h_exacts = [tf.hessians(v, theta) for v, theta in zip([v_1, v_2], thetas_all)]
h_ps = [tf.hessians(v, theta) for v, theta in zip([v_1_p, v_2_p], thetas_all)]
grad_b = [gradient(v, theta) for v, theta in zip([b_1, b_2], thetas_all)]
h_b = [tf.hessians(v, theta) for v, theta in zip([b_1, b_2], thetas_all)]
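# grad_p / h_ps differentiate the sampled DiCE objectives, grad_exact / h_exacts
# differentiate the closed-form values from Vs, and grad_b / h_b come from the
# baseline term alone; the correlations between sampled and exact quantities are
# what the plots further below examine.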

def get_feed_dict(obs, acs, rews, target, disc_count):
    feed_dict = dict(sum([
        [(pi['obs_ph'], obs[i]), (pi['acs_ph'], acs[i]), (pi['rews_ph'], rews[i]), (pi['target'], target[i]),
         (pi['discount_vec'],disc_count)]
        for i, pi in enumerate(policies)
    ], []))
    return feed_dict

# initialise variables
init = tf.global_variables_initializer()

Training

In [5]:
payout_mat_1, payout_mat_2, gamma_val = get_mat('p')
disc_count = gamma_val ** np.arange(max_steps).reshape([-1, 1, 1])  # [gamma^0, ..., gamma^(T-1)], shape [steps, 1, 1]

# per-state reward vectors in joint-action order (CC, CD, DC, DD); the environment
# pays out payout_mat[opponent][self], hence the transposed matrix for player 1
r1 = np.reshape(payout_mat_2, [-1, 1])
r2 = np.reshape(payout_mat_1, [-1, 1])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_target)
    
    actors = [get_actor(pi, sess) for pi in policies] # actors = [actor1(ob), actor2(ob)] 
    
    # train episodes
    for e in range(train_episodes):
        # use the larger-batch environment for the final (evaluation) episode
        if e == train_episodes - 1:
            env = env_large
        
        obs, acs, rets, rews = [], [], [], []
        
        # initialisation
        t = 0 
        ob, info = env.reset()
        done = False
        gamma_t = 1.                      # gamma^0 = 1
        gamma_list = np.zeros(max_steps)  # will hold gamma^t for each step
        
        # a single trajectory, having max_step steps
        while not done:
            #print('time: ', t)
            obs.append(ob)                          # update observations
            ac = [a(o) for a, o in zip(actors, ob)] # decide what actions to do next, ac = [ac1, ac2]
            
            ob, rew, done, _ = env.step(ac)
            ob[1] = ob[0]                           # feed both policies the same (agent-0) state encoding
            acs.append(ac)                          # update actions
            rets.append([r * gamma_t for r in rew]) # discounted rewards [r_1 * gamma_t, r_2 * gamma_t]
            rews.append(rew)                        # update rewards
       
            # update discounting factor
            gamma_list[t]=gamma_t
            gamma_t *= gamma_val
            t += 1
        
        # change to np.array
        obs = list(map(np.asarray, zip(*obs)))
        acs = list(map(np.asarray, zip(*acs)))
        rets = list(map(np.asarray, zip(*rets)))
        rews = list(map(np.asarray, zip(*rews)))
        
        final_return = [sess.run(pi['value_target'], {pi['obs_ph']: [o]}) for pi, o in zip(policies, ob)] # [(1, batch, 1), (1, batch, 1)]
        
        # bootstrap: add the discounted target-network value of the final observation to the last return
        for ret, final in zip(rets, final_return):
            ret[-1] += gamma_t * final[0, :, 0]
            
        # value targets R_t: discounted return-to-go, computed by backward recursion
        target = [np.copy(rew) for rew in rews]  # [(step, batch), (step, batch)]
        cum_rew = [final[0, :, 0] for final in final_return]
        for t_p in reversed(range(t)):  # t_p = t-1, ..., 0
            cum_rew = [c_rew * gamma_val + r_ex[t_p] for c_rew, r_ex in zip(cum_rew, rews)]
            for i in range(env.NUM_AGENTS):
                target[i][t_p] = np.copy(cum_rew[i])
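        # e.g. with t = 3 steps: target_2 = r_2 + gamma * V_target(s_3),
        #      target_1 = r_1 + gamma * target_2,  target_0 = r_0 + gamma * target_1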
        
        # feed dicts: sampled rollout for the DiCE estimates; payoff matrices and gamma for the exact values
        approx_feed = get_feed_dict(obs, acs, rets, target, disc_count)
        feed_dict = { R_1:r1, R_2: r2, gamma: gamma_val }
        
        v1p, v2p, _, l = sess.run([v_1_p, v_2_p, train, loss ], approx_feed) # feed data and train 
        v1, v2 = sess.run( [v_1, v_2], feed_dict)
        
        # every 40 episodes: sync the target value parameters and print the value loss
        if e % 40 == 0:
            sess.run(assign_target)  # copy theta_val into theta_val_target
            print('loss', l)
        
        # evaluate the sampled (DiCE) and exact values, gradients and Hessians on this rollout
        v1p, v2p, hp, gradp, gradb, hb = sess.run([v_1_p, v_2_p, h_ps, grad_p, grad_b, h_b], approx_feed)
        v1, v2, hexact, gradexact = sess.run([v_1, v_2, h_exacts, grad_exact], feed_dict)  # exact
loss 101623.09
loss 6008.2017
loss 1758.8754
loss 1147.6475
loss 917.39984
loss 754.1265
loss 672.46747
loss 784.5248
loss 680.8012
loss 697.8163

Plot correlations for grads/hessians

In [6]:
print("Sampled per step Returns:", [v1p*(1-gamma_val ), v2p*(1-gamma_val)])
print("Exact per step Returns:", [v1[0][0]*(1-gamma_val), v2[0][0]*(1-gamma_val)])

# Plot gradients
plt.figure()
for i in range(2):
    plt.plot( gradp[i], gradexact[i], 'o')
    print('corr ', str(i), np.corrcoef( gradp[i], gradexact[i])[1,0])

hexact = np.reshape(hexact, [2, 100])
hp = np.reshape(hp, [2, 100])
hb = np.reshape(hb, [2, 100])

# Plot Hessian
plt.figure()
for i in range(2):
    plt.plot(hp[i], hexact[i], 'o')
    print('second_corr ', str(i), np.corrcoef(hp[i], hexact[i])[1,0])


plt.figure()
for i in range(2):
    plt.plot(hp[i], hb[i], 'o')
    print('second_corr_baseline ', str(i), np.corrcoef(hp[i] + hb[i], hexact[i])[1,0])
    
Sampled per step Returns: [-1.4273268127441419, -0.862939529418946]
Exact per step Returns: [-1.4321441650390638, -0.8601446533203133]
corr  0 0.9998602205586556
corr  1 0.9993555487495428
second_corr  0 -0.8453817006111741
second_corr  1 0.7031717474433741
second_corr_baseline  0 0.950040721072172
second_corr_baseline  1 0.9739433064527004
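
The gradient estimates correlate almost perfectly with the exact gradients (corr ≈ 0.999 for both players), while the raw DiCE Hessian estimates do not (corr ≈ -0.85 and 0.70); adding the baseline term raises the Hessian correlations to ≈ 0.95 and 0.97.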

Plot flattened grads/hessians

In [7]:
sns.set_style("white", {'axes.grid': False, 'grid.color': '.9', 'grid.linestyle': u'--'}) 
matplotlib.rc('axes', titlesize=20, labelsize=16) 
matplotlib.rc('legend', fontsize=18) 
matplotlib.rc('xtick', labelsize=20) 
matplotlib.rc('ytick', labelsize=20) 
#matplotlib.rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})

# Gradient over the full parameter vector: exact (red) vs DiCE estimate (green)

fig1 = plt.figure(figsize = (8, 5))    
for value in [0,1]:
    ax = plt.subplot(2,1, value+1)
    plt.plot(  gradexact[value].reshape([-1]),'r')
    plt.plot(  gradp[value].reshape([-1]),'g')
    if value == 0:
        ax.get_xaxis().set_visible(False)
        ax.set_title("Gradient")
    plt.xlim([0, 4.1])
    plt.ylabel('Estimate')
    plt.xlabel('Gradient dimension')
    plt.xticks([0,1,2,3,4])
    
# Hessian
fig2 = plt.figure(figsize = (16,7))
for value in [0, 1]:
    ax = plt.subplot(2,2,2*value+1)
    plt.plot(  hexact[value].reshape([-1]),'r')
    plt.plot(  hp[value].reshape([-1]),'g')
    if value == 0:
        ax.get_xaxis().set_visible(False)
        ax.set_title("(a) Hessian")
    plt.xlim([0, 24.1])
    plt.ylabel('Estimate')
    plt.xlabel('Hessian dimension')
    plt.xticks([0,5,10,15,20])

for value in [0, 1]:
    ax = plt.subplot(2,2,2*value+2)
    plt.plot(  hexact[value].reshape([-1]),'r')
    plt.plot(  (hb[value] + hp[value]).reshape([-1]),'g')
    if value == 0:
        ax.get_xaxis().set_visible(False)
        ax.set_title("(b) Hessian with Baseline ")
    plt.xlim([0, 24.1])
    plt.ylabel('Estimate')
    plt.xlabel('Hessian dimension')
    plt.xticks([0,5,10,15,20])