In [667]:
import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [253]:
# Assume each bandit's reward follows a Gaussian distribution
# with standard deviation equal to half the absolute value of its mean
BANDITS = [0.10, -0.20, 1.0, -5.0, 5.0, -20.0, 10.0, 3.0, -2.0, -1.0]
def bandit_reward(bandit):
    # mean reward of the chosen bandit
    mu = BANDITS[bandit]
    sigma = np.abs(mu) / 2.0
    return np.random.normal(mu, sigma, 10)

In [254]:
# Each pull draws one sample at random from the reward batch
def pull_bandit(bandit):
    banditreward = bandit_reward(bandit)
    # randint's upper bound is exclusive, so use len(banditreward)
    choose = np.random.randint(0, len(banditreward))
    return banditreward[choose]

In [255]:
# Sanity check: the sample averages should be close to the expected values
for i, rew in enumerate(BANDITS):
    result = []
    # repeat 1000 times
    for j in range(1000):
        result.append(pull_bandit(i))
    print("Actual reward: %s; Estimated reward: %s" % (rew, np.average(result)))

Actual reward: 0.1; Estimated reward: 0.09796116798626889
Actual reward: -0.2; Estimated reward: -0.20212861914465624
Actual reward: 1.0; Estimated reward: 1.0073434318381687
Actual reward: -5.0; Estimated reward: -4.944275196351961
Actual reward: 5.0; Estimated reward: 5.044439253974279
Actual reward: -20.0; Estimated reward: -19.83753643354374
Actual reward: 10.0; Estimated reward: 9.93844320243506
Actual reward: 3.0; Estimated reward: 2.9231116309948253
Actual reward: -2.0; Estimated reward: -2.016187174943
Actual reward: -1.0; Estimated reward: -0.9936313263386887


# greedy

In [287]:
# Greedy algorithm

total_reward = 0
try_reward = []
trial = 3
ite = 0
total_iter = 1000
while ite < total_iter:
    # average three pulls per bandit, then greedily select the largest Q
    if ite < trial * len(BANDITS):
        for i in range(len(BANDITS)):
            ite += 1
            reward = round(pull_bandit(i), 1)
            try_reward.append(reward)
            total_reward += reward
    else:
        Q_table = np.average(np.array(try_reward).reshape(trial, len(BANDITS)), axis=0)
        # keep one decimal place
        Q_table = list(np.around(Q_table, decimals=1))
        Q = max(Q_table)
        index = Q_table.index(Q)
        ite += 1
        total_reward += Q

assert abs(sum(try_reward) + Q * (total_iter - trial * len(BANDITS)) - total_reward) < 1.0
print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
print("Estimated Q_table: %s; largest Q chosen: %s, bandit #%s." % (Q_table, Q, index + 1))
print("True Q_table: %s\n" % BANDITS)
if index == BANDITS.index(max(BANDITS)):
    print("You have chosen the best bandit!")
else:
    print("You haven't chosen the best bandit....")

Total reward: 8047; expected reward of the chosen bandit: 5.0...

You haven't chosen the best bandit....


• When the BANDITS means are fixed: with a small standard deviation, the greedy choice is essentially confined to the highest-mean bandits; with a very large standard deviation, other bandits may end up being chosen (see the sketch after this list).
• When the standard deviation is fixed: the larger the gaps between the means, the more likely the top bandits are selected; the smaller the gaps, the more likely another bandit slips in.
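
To make the first point concrete, here is a minimal sketch (not part of the original experiments; `noisy_pull` and the noise scale `k` are illustrative) that sets sigma = k * |mu| and measures how often three-pull averages already rank the best bandit first:

def noisy_pull(bandit, k):
    # hypothetical variant of pull_bandit with sigma = k * |mu|
    mu = BANDITS[bandit]
    return np.random.normal(mu, k * np.abs(mu))

def greedy_hit_rate(k, trials=3, runs=1000):
    # fraction of runs in which averaging `trials` pulls per bandit
    # already ranks the true best bandit first
    best = BANDITS.index(max(BANDITS))
    hits = 0
    for _ in range(runs):
        q = [np.mean([noisy_pull(b, k) for _ in range(trials)]) for b in range(len(BANDITS))]
        hits += q.index(max(q)) == best
    return hits / runs

for k in (0.1, 0.5, 2.0):
    print("k=%.1f -> best-arm hit rate: %.2f" % (k, greedy_hit_rate(k)))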

# epsilon-greedy

In [674]:
# epsilon-greedy, version 1: explore from the start

def epsilon_greedy(epsilon):
    total_reward = 0
    is_best = 0
    explore_num = 0
    ite = 0
    total_iter = 1000
    Q_table = [0] * len(BANDITS)
    while ite < total_iter:
        if random.random() < epsilon:
            ite += 1
            explore_num += 1
            choose = np.random.randint(0, len(BANDITS))
            rew = pull_bandit(choose)
            total_reward += rew
            # keep two decimal places so the table is easier to read
            Q_table[choose] = round(rew, 2)
            Q = max(Q_table)
            index = Q_table.index(Q)
        else:
            ite += 1
            Q = max(Q_table)
            index = Q_table.index(Q)
            total_reward += Q

    print("Explored %s times." % explore_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    print("Estimated Q_table: %s; largest Q chosen: %s, bandit #%s." % (Q_table, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best

In [675]:
# epsilon = 0.1: explore a new bandit 10% of the time
epsilon_greedy(0.1)

Explored 96 times.

You have chosen the best bandit!

Out[675]:
(9815.84295988971, 1)
In [735]:
# epsilon-greedy, version 2: seed a Q table with greedy trials first, then explore

def epsilon_greedy2(epsilon):
    total_reward = 0
    is_best = 0
    explore_num = 0
    try_reward = []
    trial = 3
    ite = 0
    total_iter = 1000
    Q_table = [0] * len(BANDITS)
    while ite < total_iter:

        # first run three trials per bandit to seed Q_table, then explore
        if ite < trial * len(BANDITS):
            for i in range(len(BANDITS)):
                ite += 1
                reward = round(pull_bandit(i), 1)
                try_reward.append(reward)
                total_reward += reward
            if ite == trial * len(BANDITS):
                # seed Q_table once from the trial averages,
                # so later exploration updates are not overwritten
                Q_table = np.average(np.array(try_reward).reshape(trial, len(BANDITS)), axis=0)
                Q_table = list(np.around(Q_table, decimals=1))

        # with probability epsilon pull a random bandit; with probability 1 - epsilon pull the current best
        # for a non-stationary bandit, store each bandit's rewards in a list and compute
        # Q_{n+1} = \sum_{i=1}^n \alpha (1-\alpha)^{n-i} R_i + (1-\alpha)^n Q_1
        # where alpha is a hyperparameter, R_i are the stored rewards, and Q_1 is the
        # first reward (see the incremental sketch after this cell)
        elif random.random() < epsilon:
            ite += 1
            explore_num += 1
            choose = np.random.randint(0, len(BANDITS))
            rew = pull_bandit(choose)
            total_reward += rew
            # keep two decimal places so the table is easier to read
            Q_table[choose] = round(rew, 2)
        else:
            ite += 1
            Q = max(Q_table)
            index = Q_table.index(Q)
            total_reward += Q

    print("Explored %s times." % explore_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    # if the chosen Q (two decimals) is not in the final Q_table, the largest Q came from exploration;
    # no separate table is needed in practice, this layout just makes the result easier to read
    print("Estimated Q_table: %s; largest Q chosen: %s, bandit #%s." % (Q_table, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best
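
A side note on the formula in the comment above: the same recency-weighted average can be maintained incrementally without storing reward lists. A minimal sketch, assuming a constant step size alpha (the helper name update_q is illustrative):

def update_q(q, reward, alpha=0.1):
    # incremental form: Q_{n+1} = Q_n + alpha * (R_n - Q_n)
    # expanding the recursion gives exactly
    # Q_{n+1} = sum_{i=1..n} alpha * (1-alpha)^(n-i) * R_i + (1-alpha)^n * Q_1
    return q + alpha * (reward - q)

q = 0.0
for r in [1.0, 0.5, 2.0, -1.0]:
    q = update_q(q, r)
print(round(q, 4))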

In [737]:
# epsilon = 0.1: explore a new bandit 10% of the time
epsilon_greedy2(0.1)

Explored 99 times.

You have chosen the best bandit!

Out[737]:
(14300.636224221827, 1)

epsilon-greedy summary:

• Earning the highest total reward does not necessarily mean the bandit with the highest expected reward was chosen, and vice versa. In most cases, though, the highest total reward does correspond to picking the bandit with the highest mean.
• It is less sensitive to the particular BANDITS values.

# optimistic exploration greedy

In [596]:
# optimistic exploration: keep exploring until "disappointed", then act greedily

def opt_greedy(init_q):
    total_reward = 0
    is_best = 0
    trial_num = 0
    ite = 0
    total_iter = 1000
    Q_table = [init_q] * len(BANDITS)
    indexes = range(0, len(BANDITS))
    while ite < total_iter:
        # as long as the maximum is still the optimistic initial value, keep exploring (don't give up...)
        if max(Q_table) == init_q:
            trial_num += 1
            ite += 1
            choose = indexes[np.random.randint(0, len(indexes))]
            reward = round(pull_bandit(choose), 1)
            total_reward += reward

            # update the Q table
            Q_table[choose] = round((Q_table[choose] + reward) / 2, 1)
            # update the indexes still at the optimistic value
            indexes = [i for i in range(len(Q_table)) if Q_table[i] == init_q]

        else:
            ite += 1
            choose = Q_table.index(max(Q_table))
            reward = round(pull_bandit(choose), 1)
            total_reward += reward

    print("Number of exploratory pulls: %s" % trial_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[choose]))
    print("Estimated Q_table: %s; largest Q chosen: %s, bandit #%s." % (Q_table, max(Q_table), choose + 1))
    print("True Q_table: %s\n" % BANDITS)
    if choose == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


In [597]:
opt_greedy(10)

Number of exploratory pulls: 10

You have chosen the best bandit!

Out[597]:
(10080.3, 1)

optimistic exploration summary:

• Very important: the result depends heavily on the BANDITS distribution. When the variance is very large, the method easily converges to a mean with a small absolute value.
• This is easy to understand: with a large variance (say, twice the mean), the bandit with the largest mean will sooner or later draw a very negative Q; averaging then drags its estimate down, and that bandit never gets another chance. The brief worked example below illustrates the effect.
• By comparison, epsilon-greedy and plain greedy handle this situation much better.
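
A minimal numeric sketch of that failure mode, assuming a hypothetical noise scale of twice the mean:

init_q = 10
unlucky = 10 + 2 * 10 * (-1.5)      # a -1.5-sigma draw when sigma = 2 * mu: reward = -20
q = round((init_q + unlucky) / 2, 1)
print(q)  # -5.0, far below the other arms' optimistic 10,
          # so the greedy phase never revisits this (actually best) bandit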

# upper-confidence bound

In [633]:
# upper-confidence bound

def ucb(c):
    total_reward = 0
    is_best = 0
    ite = 0
    total_iter = 1000
    Q_table_choose = [0] * len(BANDITS)
    Q_table_true = [0] * len(BANDITS)
    N_table = [0] * len(BANDITS)
    while ite < total_iter:
        ite += 1
        # cap the number of exploration steps
        # alternatively, use epsilon to decide when to explore, acting greedily the rest of the time
        if ite <= 100:
            choose = np.random.randint(0, len(BANDITS))
            rew = pull_bandit(choose)
            total_reward += rew
            N_table[choose] += 1
            Q_table_choose[choose] = round(rew, 2) + c * np.sqrt(np.log(ite) / N_table[choose])
            Q_table_true[choose] = round(rew, 2)
            index = Q_table_choose.index(max(Q_table_choose))
            Q = Q_table_true[index]
        else:
            total_reward += Q

    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    print("Estimated Q_table: %s; largest Q chosen: %s, bandit #%s." % (Q_table_true, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


In [626]:
ucb(0.1)

Total reward: 17448; expected reward of the chosen bandit: 10.0...

You have chosen the best bandit!

Out[626]:
(17448.197086410153, 1)
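
For comparison, the textbook UCB1 rule applies the bonus at every step, always pulling argmax_a Q(a) + c * sqrt(ln t / N(a)) with Q kept as a running sample mean, rather than restricting exploration to a fixed window. A minimal sketch of that variant (not the notebook's version; ucb1 is an illustrative name):

def ucb1(c, total_iter=1000):
    # pull each arm once, then always take the arm maximizing Q + bonus
    Q = [pull_bandit(a) for a in range(len(BANDITS))]
    N = [1] * len(BANDITS)
    total = sum(Q)
    for t in range(len(BANDITS) + 1, total_iter + 1):
        scores = [Q[a] + c * np.sqrt(np.log(t) / N[a]) for a in range(len(BANDITS))]
        a = scores.index(max(scores))
        r = pull_bandit(a)
        total += r
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]   # incremental sample mean
    return total, Q.index(max(Q))

# usage: total, best = ucb1(2.0)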

In [480]:
def softmax(xlist):
    ex = np.exp(xlist - np.max(xlist))
    return ex / np.sum(ex, axis=0)

In [728]:
# gradient bandit
# (not entirely sure this understanding is correct)

def gradient(alpha):
    is_best = 0
    H_table = np.random.rand(len(BANDITS))
    pi = softmax(H_table)
    ite = 0
    total_iter = 1000
    total_reward = 0
    while ite < total_iter:
        ite += 1
        # sample an arm from the softmax distribution pi
        choose = np.random.choice(len(BANDITS), p=pi)
        rew = pull_bandit(choose)
        total_reward += rew
        # preference update (baseline omitted): raise the chosen arm's
        # preference in proportion to the reward, lower the others'
        H_table = H_table - alpha * rew * pi
        H_table[choose] += alpha * rew
        pi = softmax(H_table)

    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[choose]))
    if list(pi).index(max(pi)) == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


In [730]:
gradient(0.1)

Total reward: 9931; expected reward of the chosen bandit: 10.0...

You have chosen the best bandit!

Out[730]:
(9931.761125811887, 1)
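
The update above omits a baseline. In Sutton and Barto's formulation, rewards are compared against the running average reward, which stabilizes learning when rewards sit far from zero. A hedged sketch of that variant (gradient_baseline is an illustrative name, not from the notebook):

def gradient_baseline(alpha, total_iter=1000):
    # gradient bandit with an average-reward baseline:
    # H(A) += alpha * (R - baseline) * (1 - pi(A)); other arms move down
    H = np.zeros(len(BANDITS))
    baseline = 0.0
    total = 0.0
    for t in range(1, total_iter + 1):
        pi = softmax(H)
        a = np.random.choice(len(BANDITS), p=pi)
        r = pull_bandit(a)
        total += r
        H = H - alpha * (r - baseline) * pi
        H[a] += alpha * (r - baseline)
        baseline += (r - baseline) / t   # running mean of all rewards
    return total, int(np.argmax(H))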

# All Together

In [ ]:
best_list = []
aver_reward = []
# repeat each experiment 100 times
N = 100
x = np.logspace(-7, 2, num=10, base=2)

for param in x:
    ed_best, oeg_best, ucb_best, gra_best = 0, 0, 0, 0
    ed_rew, oeg_rew, ucb_rew, gra_rew = 0, 0, 0, 0
    for i in range(N):
        # each algorithm's knob (epsilon, initial Q, c, alpha) is swept over the same grid
        edr, egb = epsilon_greedy2(param)
        oegr, oegb = opt_greedy(param)
        ucbr, ucbb = ucb(param)
        grar, grab = gradient(param)
        ed_best += egb
        ed_rew += edr
        oeg_best += oegb
        oeg_rew += oegr
        ucb_best += ucbb
        ucb_rew += ucbr
        gra_best += grab
        gra_rew += grar
    best_list.append([ed_best/N, oeg_best/N, ucb_best/N, gra_best/N])
    aver_reward.append([ed_rew/N, oeg_rew/N, ucb_rew/N, gra_rew/N])

In [732]:
best_list = np.array(best_list)
aver_reward = np.array(aver_reward)
y_edb, y_oegb, y_ucbb, y_grab = best_list[:,0], best_list[:,1], best_list[:,2], best_list[:,3]
y_edr, y_oegr, y_ucbr, y_grar = aver_reward[:,0], aver_reward[:,1], aver_reward[:,2], aver_reward[:,3]

plt.figure(figsize=(12, 8))
plt.subplots_adjust(left=.125, right=.9, bottom=.1, top=.9, wspace=.1, hspace=.3)

plt.subplot(2, 1, 1)
plt.title('All Bandit Algorithm Results: Best-Arm Rate')
plt.plot(x, y_edb, color='black', label='epsilon-greedy')
plt.plot(x, y_oegb, color='red', label='optimistic exploration greedy')
plt.plot(x, y_ucbb, color='skyblue', label='upper-confidence bound')
plt.plot(x, y_grab, color='green', label='gradient')
plt.legend()
plt.ylabel('probability of getting the best bandit')

plt.subplot(2, 1, 2)
plt.title('All Bandit Algorithm Results: Average Reward')
plt.plot(x, y_edr, color='black', label='epsilon-greedy')
plt.plot(x, y_oegr, color='red', label='optimistic exploration greedy')
plt.plot(x, y_ucbr, color='skyblue', label='upper-confidence bound')
plt.plot(x, y_grar, color='green', label='gradient')
plt.legend()
plt.xlabel('parameter value')
plt.ylabel('average reward')

plt.show()


The results above are also sensitive to:

• the initial expected values chosen for BANDITS
• the other initial values each algorithm uses
• other code details
