import pandas as pd import matplotlib.pyplot as plt import matplotlib.mlab as mlab import numpy.random as rand import numpy as np %matplotlib inline from scipy.stats import beta import random as random import scipy.integrate as integrate import math as math import time from bokeh.plotting import * from bokeh.objects import Glyph from bokeh.objects import Range1d output_notebook(url="default") def betaPlot(prob = 0.5, N = 100): figure(tools="", title = 'Beta Distribution', x_axis_label = 'Success Probability', y_axis_label = 'PDF', x_range=Range1d(start=0, end=1)) hold() a = 1 b = 1 x = np.linspace(0, 1, 100) line(x, beta.pdf(x,a,b), color="#2E2EFE", plot_width=800, plot_height=500, legend='Beta Posterior') line([prob,prob],[0,10], color="#FE2E2E", line_dash = 'dashed', legend='True Mean') grid().grid_line_alpha=0.3 renderer1 = [r for r in curplot().renderers if isinstance(r, Glyph)][0] ds1 = renderer1.data_source show() for i in range(N): test = rand.rand() < 0.5 if i < 6: print 'a = %s and b = %s' %(a,b) raw_input("") print 'Payoff: %s' %test.real a += test b += not(test) ds1.data["x"] = x ds1.data["y"] = beta.pdf(x,a,b) ds1._dirty = True cursession().store_objects(ds1) time.sleep(0.1) betaPlot() class MAB(object): def __init__(self,numArms): self.armPayoffs = rand.uniform(0,1,numArms) self.successes = np.ones(numArms) self.failures = np.ones(numArms) def pullArm(self,arm): test = rand.uniform() result = test < self.armPayoffs[arm] bestResult = test < max(self.armPayoffs) regret = bestResult - result self.successes[arm] += result self.failures[arm] += not(result) return result, regret def strategyStratifiedSample(mab): samples = mab.successes + mab.failures bestEstArm = samples == min(samples) return random.sample(np.where(bestEstArm)[0], 1)[0] def mab3Plot(strategy, pulls = 100): figure(tools="", title = 'Multi-Armed Bandit', x_axis_label = 'Success Probability', y_axis_label = 'PDF', x_range=Range1d(start=0, end=1)) hold() mab = MAB(3) x = np.linspace(0, 1, 100) line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#2E2EFE", plot_width=800, plot_height=500, legend='Bandit 1 Posterior') line([mab.armPayoffs[0],mab.armPayoffs[0]],[0,10], color="#2E2EFE", line_dash = 'dashed', legend='Bandit 1 Truth') line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#AC58FA", plot_width=800, plot_height=500, legend='Bandit 2 Posterior') line([mab.armPayoffs[1],mab.armPayoffs[1]],[0,10], color="#AC58FA", line_dash = 'dashed', legend='Bandit 2 Truth') line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#FE642E", plot_width=800, plot_height=500, legend='Bandit 3 Posterior') line([mab.armPayoffs[2],mab.armPayoffs[2]],[0,10], color="#FE642E", line_dash = 'dashed', legend='Bandit 3 Truth') grid().grid_line_alpha=0.3 renderer0 = [r for r in curplot().renderers if isinstance(r, Glyph)][0] ds0 = renderer0.data_source renderer2 = [r for r in curplot().renderers if isinstance(r, Glyph)][2] ds2 = renderer2.data_source renderer4 = [r for r in curplot().renderers if isinstance(r, Glyph)][4] ds4 = renderer4.data_source show() for i in range(pulls): arm = strategy(mab) result, regret = mab.pullArm(arm) ds0.data["x"] = x ds0.data["y"] = beta.pdf(x,mab.successes[0],mab.failures[0]) ds0._dirty = True ds2.data["x"] = x ds2.data["y"] = beta.pdf(x,mab.successes[1],mab.failures[1]) ds2._dirty = True ds4.data["x"] = x ds4.data["y"] = beta.pdf(x,mab.successes[2],mab.failures[2]) ds4._dirty = True cursession().store_objects(ds0) cursession().store_objects(ds2) cursession().store_objects(ds4) time.sleep(0.01) mab3Plot(strategy=strategyStratifiedSample) def stratifiedRegret(cdata): figure(tools="", title = 'Multi-Armed Bandit Regret', x_axis_label = 'Iteration', y_axis_label = 'Regret', x_range=Range1d(start=0, end=pulls), y_range=Range1d(start=0, end=max(cdata))) hold() line(range(pulls), cdata, color="#FF4000", plot_width=800, plot_height=500, legend='Stratified Sample') grid().grid_line_alpha=0.3 legend()[0].orientation = "top_left" show() mab = MAB(3) pulls = 1000 data = np.zeros(pulls) for i in range(pulls): arm = strategyStratifiedSample(mab) result, regret = mab.pullArm(arm) data[i] = regret cdata = np.cumsum(data) print mab.armPayoffs stratifiedRegret(cdata) def strategyGreedy(mab): rates = mab.successes/(mab.successes + mab.failures) bestEstArm = rates == max(rates) return random.sample(np.where(bestEstArm)[0], 1)[0] mab3Plot(strategy=strategyGreedy) def strategyEpsilonGreedy(mab, epsilon = 0.1): if random.random() < epsilon: return random.randint(0,len(mab.successes)-1) else: return strategyGreedy(mab) def strategyEpsilonFirst(mab, pulls = 100, epsilon = 0.1): trials = pulls*epsilon samples = mab.successes + mab.failures if sum(samples) < trials: return strategyStratifiedSample(mab) else: return strategyGreedy(mab) mab3Plot(strategy=strategyEpsilonFirst) def basicRegretPlot(data): randBandit = np.cumsum(data.StratifiedSample.astype(float)) greedyBandit = np.cumsum(data.Greedy.astype(float)) epsGreedyBandit = np.cumsum(data.EpsilonGreedy.astype(float)) epsFirstBandit = np.cumsum(data.EpsilonFirst.astype(float)) figure(tools="", title = 'Multi-Armed Bandit Regret', x_axis_label = 'Iteration', y_axis_label = 'Regret', x_range=Range1d(start=0, end=len(data)), y_range=Range1d(start=0, end=max(randBandit))) hold() line(range(len(randBandit)), randBandit, color="#FF4000", plot_width=800, plot_height=500, legend='Stratified Sample') line(range(len(randBandit)), greedyBandit, color="#AC58FA", plot_width=800, plot_height=500, legend='Greedy') line(range(len(randBandit)), epsGreedyBandit, color="#0080FF", plot_width=800, plot_height=500, legend='Epsilon Greedy') line(range(len(randBandit)), epsFirstBandit, color="#01DF01", plot_width=800, plot_height=500, legend='Epsilon First') grid().grid_line_alpha=0.3 legend()[0].orientation = "top_left" show() class MABs(object): def __init__(self,armPayoffs): self.armPayoffs = armPayoffs self.successes = np.ones(len(armPayoffs)) self.failures = np.ones(len(armPayoffs)) def pullArm(self,arm,test): result = test < self.armPayoffs[arm] bestResult = test < max(self.armPayoffs) regret = bestResult - result self.successes[arm] += result self.failures[arm] += not(result) return result, regret def genBasicData(numArms = 3, pulls = 100): armPayoffs = rand.uniform(0,1,numArms) mab0 = MABs(armPayoffs) mab1 = MABs(armPayoffs) mab2 = MABs(armPayoffs) mab3 = MABs(armPayoffs) cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst'] data = pd.DataFrame(columns=cols) for i in range(pulls): test = rand.uniform() arm0 = strategyStratifiedSample(mab0) result0, regret0 = mab0.pullArm(arm0, test) arm1 = strategyGreedy(mab1) result1, regret1 = mab1.pullArm(arm1, test) arm2 = strategyEpsilonGreedy(mab2, 0.1) result2, regret2 = mab2.pullArm(arm2, test) arm3 = strategyEpsilonFirst(mab3, pulls, 0.1) result3, regret3 = mab3.pullArm(arm3, test) row = pd.DataFrame([dict(StratifiedSample = regret0, Greedy = regret1, EpsilonGreedy = regret2, EpsilonFirst = regret3)]) data = data.append(row, ignore_index=True) #print mab0.armPayoffs return data data = genBasicData() basicRegretPlot(data) def basicSimulation(sims = 100): data = genBasicData() for i in range(1,sims): data += genBasicData() return data/100.0 data = basicSimulation() basicRegretPlot(data) def bpmProbs(mab): y = mab.successes-1 n = mab.successes + mab.failures - 2 k = len(y) ans = np.zeros(k) for i in range(k): indx = range(k) del indx[i] def f(x): r = beta.pdf(x, y[i]+1, n[i]-y[i]+1) for j in indx: r = r * beta.cdf(x, y[j]+1, n[j]-y[j]+1) return r ans[i] = integrate.quad(f,0,1)[0] return ans def weightedSample(weights): '''Return the index to sample based on weight vector''' r = random.random() cweight = np.cumsum(weights) choice = min(cweight[np.where(cweight >= r)]) return np.where(cweight == choice)[0][0] pd.Series([weightedSample([0.333,0.333,0.334]) for _ in range(10000)]).hist(figsize=(12,7)) #pd.Series([weightedSample([0.8,0.1,0.1]) for _ in range(10000)]).hist(figsize=(12,7)) def StrategyBPM(mab): weights = bpmProbs(mab) return weightedSample(weights) mab3Plot(strategy=StrategyBPM) def genBasicData2(numArms = 3, pulls = 100): armPayoffs = rand.uniform(0,1,numArms) mab0 = MABs(armPayoffs) mab1 = MABs(armPayoffs) mab2 = MABs(armPayoffs) mab3 = MABs(armPayoffs) mab4 = MABs(armPayoffs) cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst','BPM'] data = pd.DataFrame(columns=cols) for i in range(pulls): test = rand.uniform() arm0 = strategyStratifiedSample(mab0) result0, regret0 = mab0.pullArm(arm0, test) arm1 = strategyGreedy(mab1) result1, regret1 = mab1.pullArm(arm1, test) arm2 = strategyEpsilonGreedy(mab2, 0.1) result2, regret2 = mab2.pullArm(arm2, test) arm3 = strategyEpsilonFirst(mab3, pulls, 0.1) result3, regret3 = mab3.pullArm(arm3, test) arm4 = StrategyBPM(mab4) result4, regret4 = mab4.pullArm(arm4, test) row = pd.DataFrame([dict(StratifiedSample = regret0, Greedy = regret1, EpsilonGreedy = regret2, EpsilonFirst = regret3, BPM = regret4)]) data = data.append(row, ignore_index=True) #print mab0.armPayoffs return data def basicSimulation2(sims = 100): data = genBasicData2() dataseries = [] dataseries.append(data.sum()) for i in range(1,sims): d = genBasicData2() data += d dataseries.append(d.sum()) return data/100.0, dataseries def basicRegretPlot2(data): randBandit = np.cumsum(data.StratifiedSample.astype(float)) greedyBandit = np.cumsum(data.Greedy.astype(float)) epsGreedyBandit = np.cumsum(data.EpsilonGreedy.astype(float)) epsFirstBandit = np.cumsum(data.EpsilonFirst.astype(float)) BPMBandit = np.cumsum(data.BPM.astype(float)) figure(tools="", title = 'Multi-Armed Bandit Regret', x_axis_label = 'Iteration', y_axis_label = 'Regret', x_range=Range1d(start=0, end=len(data)), y_range=Range1d(start=0, end=max(randBandit))) hold() line(range(len(randBandit)), randBandit, color="#FF4000", plot_width=800, plot_height=500, legend='Stratified Sample') line(range(len(randBandit)), greedyBandit, color="#AC58FA", plot_width=800, plot_height=500, legend='Greedy') line(range(len(randBandit)), epsGreedyBandit, color="#0080FF", plot_width=800, plot_height=500, legend='Epsilon Greedy') line(range(len(randBandit)), epsFirstBandit, color="#01DF01", plot_width=800, plot_height=500, legend='Epsilon First') line(range(len(randBandit)), BPMBandit, color="#FF0000", plot_width=800, plot_height=500, legend='BPM') grid().grid_line_alpha=0.3 legend()[0].orientation = "top_left" show() data, dataseries = basicSimulation2() basicRegretPlot2(data) pd.concat(dataseries,axis=1).mean(axis=1) pd.concat(dataseries,axis=1).var(axis=1) def genBasicData3(numArms = 3, pulls = 100): armPayoffs = rand.uniform(0,1,numArms) mab0 = MABs(armPayoffs) mab1 = MABs(armPayoffs) mab2 = MABs(armPayoffs) mab3 = MABs(armPayoffs) mab4 = MABs(armPayoffs) cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst','BPM'] data = pd.DataFrame(columns=cols) for i in range(pulls): if i == pulls/2: mab0.armPayoffs = mab1.armPayoffs = mab2.armPayoffs = \ mab3.armPayoffs = mab4.armPayoffs = rand.uniform(0,1,numArms) test = rand.uniform() arm0 = strategyStratifiedSample(mab0) result0, regret0 = mab0.pullArm(arm0, test) arm1 = strategyGreedy(mab1) result1, regret1 = mab1.pullArm(arm1, test) arm2 = strategyEpsilonGreedy(mab2, 0.1) result2, regret2 = mab2.pullArm(arm2, test) arm3 = strategyEpsilonFirst(mab3, pulls, 0.1) result3, regret3 = mab3.pullArm(arm3, test) arm4 = StrategyBPM(mab4) result4, regret4 = mab4.pullArm(arm4, test) row = pd.DataFrame([dict(StratifiedSample = regret0, Greedy = regret1, EpsilonGreedy = regret2, EpsilonFirst = regret3, BPM = regret4)]) data = data.append(row, ignore_index=True) #print mab0.armPayoffs return data def basicSimulation3(sims = 100): data = genBasicData3() for i in range(1,sims): data += genBasicData3() return data/100.0 data = basicSimulation3() basicRegretPlot2(data) def mab3PlotVarying(strategy, pulls = 100): figure(tools="", title = 'Multi-Armed Bandit', x_axis_label = 'Success Probability', y_axis_label = 'PDF', x_range=Range1d(start=0, end=1)) hold() armPayoffs = np.array([0, 0.5, 0]) x1 = np.linspace(0, 1.5, 100) payoff0 = map(math.sin, x1) payoff2 = map(math.cos, x1) armPayoffs[0] = payoff0[0] armPayoffs[2] = payoff2[0] mab = MABs(armPayoffs) x = np.linspace(0, 1, 100) line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#2E2EFE", plot_width=800, plot_height=500, legend='Bandit 1 Posterior') line([mab.armPayoffs[0],mab.armPayoffs[0]],[0,10], color="#2E2EFE", line_dash = 'dashed', legend='Bandit 1 Truth') line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#AC58FA", plot_width=800, plot_height=500, legend='Bandit 2 Posterior') line([mab.armPayoffs[1],mab.armPayoffs[1]],[0,10], color="#AC58FA", line_dash = 'dashed', legend='Bandit 2 Truth') line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#FE642E", plot_width=800, plot_height=500, legend='Bandit 3 Posterior') line([mab.armPayoffs[2],mab.armPayoffs[2]],[0,10], color="#FE642E", line_dash = 'dashed', legend='Bandit 3 Truth') grid().grid_line_alpha=0.3 renderer0 = [r for r in curplot().renderers if isinstance(r, Glyph)][0] ds0 = renderer0.data_source renderer1 = [r for r in curplot().renderers if isinstance(r, Glyph)][1] ds1 = renderer1.data_source renderer2 = [r for r in curplot().renderers if isinstance(r, Glyph)][2] ds2 = renderer2.data_source renderer3 = [r for r in curplot().renderers if isinstance(r, Glyph)][3] ds3 = renderer3.data_source renderer4 = [r for r in curplot().renderers if isinstance(r, Glyph)][4] ds4 = renderer4.data_source renderer5 = [r for r in curplot().renderers if isinstance(r, Glyph)][5] ds5 = renderer5.data_source show() for i in range(pulls): test = rand.uniform() armPayoffs[0] = payoff0[i] armPayoffs[2] = payoff2[i] mab.armPayoffs = armPayoffs arm = strategy(mab) result, regret = mab.pullArm(arm, test) ds0.data["x"] = x ds0.data["y"] = beta.pdf(x,mab.successes[0],mab.failures[0]) ds0._dirty = True ds1.data["x"] = [payoff0[i],payoff0[i]] ds1.data["y"] = [0,10] ds1._dirty = True ds2.data["x"] = x ds2.data["y"] = beta.pdf(x,mab.successes[1],mab.failures[1]) ds2._dirty = True ds3.data["x"] = [0.5,0.5] ds3.data["y"] = [0,10] ds3._dirty = True ds4.data["x"] = x ds4.data["y"] = beta.pdf(x,mab.successes[2],mab.failures[2]) ds4._dirty = True ds5.data["x"] = [payoff2[i],payoff2[i]] ds5.data["y"] = [0,10] ds5._dirty = True cursession().store_objects(ds0) cursession().store_objects(ds1) cursession().store_objects(ds2) cursession().store_objects(ds3) cursession().store_objects(ds4) cursession().store_objects(ds5) time.sleep(0.01) mab3PlotVarying(strategy=StrategyBPM) class MABdecay(object): def __init__(self,armPayoffs,decay): self.armPayoffs = armPayoffs self.successes = np.ones(len(armPayoffs)) self.failures = np.ones(len(armPayoffs)) self.decay = decay def pullArm(self,arm,test): result = test < self.armPayoffs[arm] bestResult = test < max(self.armPayoffs) regret = bestResult - result self.successes[arm] += result self.failures[arm] += not(result) #decay self.successes[arm] = max(1, (1-self.decay)*self.successes[arm]) self.failures[arm] = max(1, (1-self.decay)*self.failures[arm]) return result, regret def mab3PlotVarying2(strategy, pulls = 100, decay = 0.1): figure(tools="", title = 'Multi-Armed Bandit', x_axis_label = 'Success Probability', y_axis_label = 'PDF', x_range=Range1d(start=0, end=1)) hold() armPayoffs = np.array([0, 0.5, 0]) x1 = np.linspace(0, 1.5, 100) payoff0 = map(math.sin, x1) payoff2 = map(math.cos, x1) armPayoffs[0] = payoff0[0] armPayoffs[2] = payoff2[0] mab = MABdecay(armPayoffs, decay) x = np.linspace(0, 1, 100) line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#2E2EFE", plot_width=800, plot_height=500, legend='Bandit 1 Posterior') line([mab.armPayoffs[0],mab.armPayoffs[0]],[0,10], color="#2E2EFE", line_dash = 'dashed', legend='Bandit 1 Truth') line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#AC58FA", plot_width=800, plot_height=500, legend='Bandit 2 Posterior') line([mab.armPayoffs[1],mab.armPayoffs[1]],[0,10], color="#AC58FA", line_dash = 'dashed', legend='Bandit 2 Truth') line(x, beta.pdf(x,mab.successes[0],mab.failures[0]), color="#FE642E", plot_width=800, plot_height=500, legend='Bandit 3 Posterior') line([mab.armPayoffs[2],mab.armPayoffs[2]],[0,10], color="#FE642E", line_dash = 'dashed', legend='Bandit 3 Truth') grid().grid_line_alpha=0.3 renderer0 = [r for r in curplot().renderers if isinstance(r, Glyph)][0] ds0 = renderer0.data_source renderer1 = [r for r in curplot().renderers if isinstance(r, Glyph)][1] ds1 = renderer1.data_source renderer2 = [r for r in curplot().renderers if isinstance(r, Glyph)][2] ds2 = renderer2.data_source renderer3 = [r for r in curplot().renderers if isinstance(r, Glyph)][3] ds3 = renderer3.data_source renderer4 = [r for r in curplot().renderers if isinstance(r, Glyph)][4] ds4 = renderer4.data_source renderer5 = [r for r in curplot().renderers if isinstance(r, Glyph)][5] ds5 = renderer5.data_source show() for i in range(pulls): test = rand.uniform() armPayoffs[0] = payoff0[i] armPayoffs[2] = payoff2[i] mab.armPayoffs = armPayoffs arm = strategy(mab) result, regret = mab.pullArm(arm, test) ds0.data["x"] = x ds0.data["y"] = beta.pdf(x,mab.successes[0],mab.failures[0]) ds0._dirty = True ds1.data["x"] = [payoff0[i],payoff0[i]] ds1.data["y"] = [0,10] ds1._dirty = True ds2.data["x"] = x ds2.data["y"] = beta.pdf(x,mab.successes[1],mab.failures[1]) ds2._dirty = True ds3.data["x"] = [0.5,0.5] ds3.data["y"] = [0,10] ds3._dirty = True ds4.data["x"] = x ds4.data["y"] = beta.pdf(x,mab.successes[2],mab.failures[2]) ds4._dirty = True ds5.data["x"] = [payoff2[i],payoff2[i]] ds5.data["y"] = [0,10] ds5._dirty = True cursession().store_objects(ds0) cursession().store_objects(ds1) cursession().store_objects(ds2) cursession().store_objects(ds3) cursession().store_objects(ds4) cursession().store_objects(ds5) time.sleep(0.01) mab3PlotVarying2(strategy=StrategyBPM) def genBasicData4(numArms = 3, pulls = 100, decay = 0.1): armPayoffs = [0, 0.5, 0] x = np.linspace(0, 2, 100) payoff0 = map(math.sin, x) payoff2 = map(math.cos, x) armPayoffs[0] = payoff0[0] armPayoffs[2] = payoff2[0] mab0 = MABdecay(armPayoffs, decay = 0.1) mab1 = MABdecay(armPayoffs, decay = 0.1) mab2 = MABdecay(armPayoffs, decay = 0.1) mab3 = MABdecay(armPayoffs, decay = 0.1) mab4 = MABdecay(armPayoffs, decay = 0.1) cols = ['StratifiedSample','Greedy','EpsilonGreedy','EpsilonFirst','BPM'] data = pd.DataFrame(columns=cols) for i in range(pulls): armPayoffs[0] = payoff0[i] armPayoffs[2] = payoff2[i] mab0.armPayoffs = mab1.armPayoffs = mab2.armPayoffs = \ mab3.armPayoffs = mab4.armPayoffs = armPayoffs test = rand.uniform() arm0 = strategyStratifiedSample(mab0) result0, regret0 = mab0.pullArm(arm0, test) arm1 = strategyGreedy(mab1) result1, regret1 = mab1.pullArm(arm1, test) arm2 = strategyEpsilonGreedy(mab2, 0.1) result2, regret2 = mab2.pullArm(arm2, test) arm3 = strategyEpsilonFirst(mab3, pulls, 0.1) result3, regret3 = mab3.pullArm(arm3, test) arm4 = StrategyBPM(mab4) result4, regret4 = mab4.pullArm(arm4, test) row = pd.DataFrame([dict(StratifiedSample = regret0, Greedy = regret1, EpsilonGreedy = regret2, EpsilonFirst = regret3, BPM = regret4)]) data = data.append(row, ignore_index=True) return data def basicSimulation4(sims = 100): data = genBasicData4() dataseries = [] dataseries.append(data.sum()) for i in range(1,sims): d = genBasicData4() data += d dataseries.append(d.sum()) return data/100.0, dataseries data, dataseries = basicSimulation4() basicRegretPlot2(data) pd.concat(dataseries,axis=1).mean(axis=1) pd.concat(dataseries,axis=1).var(axis=1)