This post implements some multi-armed bandit code. The algorithms implemented are ε-greedy, UCB, and Thompson Sampling. UCB chooses the arm according to

$$\text{choice} = \arg\max_i\Big(\bar{x}_i + c\sqrt{\frac{2\ln t}{N_i}}\Big)$$

where $\bar{x}_i$ is the tracked reward of arm $i$, $N_i$ is the number of times arm $i$ has been pulled, $t$ is the current slot, and $c$ is a tuning factor, implemented in the code as the maximum value within the observation window. The tuning factor is needed because the rewards produced by my simulated arms are not binary (0 or 1) but random numbers generated from a Rayleigh distribution [5]; see [4] for the theory behind this.
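As a side note, the Rayleigh sampling used in the code is plain inverse-transform sampling, and the resulting arm has expected reward sigma*sqrt(pi/2). A standalone sketch (the helper rayleigh_sample is mine, not part of mab.py) that checks the empirical mean against the theoretical one:

import math
import random

def rayleigh_sample(rand, sigma):
    # Inverse-transform sampling: if U ~ Uniform(0,1), then
    # sigma*sqrt(-2*ln(1-U)) is Rayleigh-distributed with scale sigma.
    u = 1.0 - rand.uniform(0, 1)  # keep the argument of log() in (0, 1]
    return sigma * math.sqrt(-2 * math.log(u))

rand = random.Random(12323)
sigma = 2.0
samples = [rayleigh_sample(rand, sigma) for _ in range(100000)]
print(sum(samples) / len(samples))     # empirical mean, close to...
print(sigma * math.sqrt(math.pi / 2))  # ...the theoretical mean, ~2.5066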
For Thompson Sampling, each arm carries two beta-distribution parameters (a, b). Only when the obtained reward falls below the mean of all arms' average rewards does b increase by 1; otherwise a increases by 1. This is a scheme I came up with myself, with no theory behind it.
total = 0
for i in range(arms):
    total += self.avergeRewards[i][1]
measure = total / arms
a, b = self.ab[choice]
if v >= measure:
    a = a + 1
else:
    b = b + 1
self.ab[choice] = (a, b)
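A quick standalone sanity check of this heuristic (not part of mab.py; the counts below are made up for illustration): the arm that has accumulated more successes a draws larger samples from Beta(a+1, b+1) most of the time, so Thompson Sampling keeps favoring it.

import numpy as np

rand = np.random.RandomState(0)
ab = [(80, 20), (20, 80)]  # hypothetical (a, b) counts after many pulls
wins = [0, 0]
for _ in range(10000):
    samples = [rand.beta(a + 1, b + 1) for (a, b) in ab]
    wins[0 if samples[0] >= samples[1] else 1] += 1
print(wins)  # the (80, 20) arm wins the overwhelming majority of draws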
The code, mab.py:
import random
import math
import numpy as np
# Base class for a bandit arm: exposes the current reward, an id, and a
# per-slot update hook.
class Arm(object):
    def getReward(self):
        return 0
    def getID(self):
        return 0
    def Update(self, slot):
        pass
# Arm whose reward is a Rayleigh-distributed random number; sigma is re-drawn
# from the candidate list every 100 reward updates.
class RayleighArm(Arm):
    def __init__(self, id, seed, sigmas):
        self.rand = random.Random()
        self.rand.seed(seed)
        self.sigmarand = random.Random()
        self.sigmarand.seed(seed)
        self.id = id
        self.sigmas = list(sigmas)
        self.update_counter = 0
        i = self.sigmarand.randint(0, 3000) % len(self.sigmas)
        self.sigma = self.sigmas[i]
        self.slot = 0
        self.value = 0
        self.generateV()
    def generateV(self):
        # Inverse-transform sampling of the Rayleigh distribution.
        r = 1.0 - self.rand.uniform(0, 1)  # keep r in (0, 1] to avoid log(0)
        self.value = self.sigma * math.sqrt(-2 * math.log(r))
        self.update_counter += 1
        if self.update_counter > 100:
            self.update_counter = 0
            i = self.sigmarand.randint(0, 3000) % len(self.sigmas)
            self.sigma = self.sigmas[i]
    def getReward(self):
        return self.value
    def getID(self):
        return self.id
    def Update(self, slot):
        if slot > self.slot:
            self.generateV()
            self.slot = slot
# Base bandit: keeps the registered arms and exposes the per-slot optimum.
class Bandit(object):
    def __init__(self):
        self.arms = []
    def RegisterArm(self, arm):
        # Register each arm id at most once.
        for existing in self.arms:
            if arm.getID() == existing.getID():
                return
        self.arms.append(arm)
    def getArmsSize(self):
        return len(self.arms)
    def getValueOfArm(self, i):
        return self.arms[i].getReward()
    def getOptimal(self):
        # Reward of the best arm in the current slot (the oracle choice).
        best = self.arms[0].getReward()
        for arm in self.arms[1:]:
            v = arm.getReward()
            if v > best:
                best = v
        return best
    def getRewardAndRegret(self):
        return 0, 0, 0
# Epsilon-greedy: with probability epsilon pull a random arm, otherwise pull
# the arm with the highest running-average reward.
class EpsilonBandit(Bandit):
    def __init__(self, seed, epsilon):
        Bandit.__init__(self)
        self.rand = random.Random()
        self.rand.seed(seed)
        self.randint = random.Random()
        self.randint.seed(seed)
        self.epsilon = epsilon
        self.avergeRewards = []  # per arm: (pull count, average reward)
    def getRewardAndRegret(self):
        reward = 0
        regret = 0
        arms = self.getArmsSize()
        optimal = self.getOptimal()
        pull = 0
        if not len(self.avergeRewards):
            # First slot: seed every arm with one observation.
            reward = optimal
            largest = self.getValueOfArm(0)
            for i in range(arms):
                v = self.getValueOfArm(i)
                self.avergeRewards.append((1, v))
                if v > largest:
                    largest = v
                    pull = i
        else:
            pull = self.decision()
            reward = self.getValueOfArm(pull)
            regret = optimal - reward
        return reward, regret, pull
    def decision(self):
        e = self.rand.uniform(0, 1)
        arms = self.getArmsSize()
        choice = 0
        if e < self.epsilon and arms > 1:
            # Explore: pick an arm uniformly at random.
            choice = self.randint.randint(0, 1000) % arms
        else:
            # Exploit: pick the arm with the best running average.
            largest = self.avergeRewards[0][1]
            for i in range(1, arms):
                if self.avergeRewards[i][1] > largest:
                    choice = i
                    largest = self.avergeRewards[i][1]
        # Update the running average of the chosen arm.
        v = self.getValueOfArm(choice)
        pull = self.avergeRewards[choice][0]
        average = (self.avergeRewards[choice][1] * pull + v) / (pull + 1)
        self.avergeRewards[choice] = (pull + 1, average)
        return choice
# UCB: pull the arm that maximizes average + c*sqrt(2*ln(t)/N), where the
# tuning factor c is the largest sample in a sliding window of recent rewards.
class UCBBandit(Bandit):
    def __init__(self, alpha, window):
        Bandit.__init__(self)
        self.alpha = alpha  # weight of the exponential filter
        self.avergeRewards = []  # per arm: (pull count, tracked reward)
        self.T = 1  # number of slots seen so far
        self.window = window
        self.windowRecord = []  # per arm: recent reward samples
    def getWindowMaxOfArm(self, arm):
        best = 0
        for sample in self.windowRecord[arm]:
            if sample > best:
                best = sample
        return best
    def UpdateWindowSample(self, arm, sample):
        # Append the new sample and keep only the last `window` entries.
        self.windowRecord[arm].append(sample)
        if len(self.windowRecord[arm]) > self.window:
            self.windowRecord[arm] = self.windowRecord[arm][-self.window:]
    def getRewardAndRegret(self):
        reward = 0
        regret = 0
        arms = self.getArmsSize()
        optimal = self.getOptimal()
        pull = 0
        if not len(self.avergeRewards):
            # First slot: seed every arm with one observation.
            reward = optimal
            largest = self.getValueOfArm(0)
            for i in range(arms):
                v = self.getValueOfArm(i)
                self.windowRecord.append([v])
                self.avergeRewards.append((1, v))
                if v > largest:
                    largest = v
                    pull = i
        else:
            pull = self.decision()
            reward = self.getValueOfArm(pull)
            regret = optimal - reward
        self.UpdateWindowSample(pull, reward)
        self.T += 1
        return reward, regret, pull
    def decision(self):
        arms = self.getArmsSize()
        choice = 0
        measure = []
        for i in range(arms):
            N = self.avergeRewards[i][0]
            tun = self.getWindowMaxOfArm(i)
            m = self.avergeRewards[i][1] + tun * math.sqrt(2 * math.log(self.T) / N)
            measure.append(m)
        max_m = measure[0]
        for i in range(1, arms):
            if measure[i] > max_m:
                max_m = measure[i]
                choice = i
        # Track the chosen arm's reward with an exponential filter rather
        # than the plain running average:
        # average = (self.avergeRewards[choice][1] * pull + v) / (pull + 1)
        # self.avergeRewards[choice] = (pull + 1, average)
        v = self.getValueOfArm(choice)
        pull = self.avergeRewards[choice][0]
        smooth = self.avergeRewards[choice][1] * (1 - self.alpha) + self.alpha * v
        self.avergeRewards[choice] = (pull + 1, smooth)
        return choice
# Thompson Sampling, following
# https://visualstudiomagazine.com/articles/2019/06/01/thompson-sampling.aspx:
# sample an index from Beta(a+1, b+1) for each arm and pull the largest.
class Thompson(Bandit):
    def __init__(self, seed):
        Bandit.__init__(self)
        self.rand = np.random.RandomState(seed)
        self.ab = []  # per arm: beta parameters (a, b)
        self.avergeRewards = []  # per arm: (pull count, average reward)
    def RegisterArm(self, arm, a, b):
        Bandit.RegisterArm(self, arm)
        self.ab.append((a, b))
    def getRewardAndRegret(self):
        reward = 0
        regret = 0
        arms = self.getArmsSize()
        optimal = self.getOptimal()
        choice = 0
        if not len(self.avergeRewards):
            # First slot: seed every arm with one observation.
            reward = optimal
            largest = self.getValueOfArm(0)
            for i in range(arms):
                v = self.getValueOfArm(i)
                self.avergeRewards.append((1, v))
                if v > largest:
                    largest = v
                    choice = i
        else:
            choice = self.decision()
            reward = self.getValueOfArm(choice)
            regret = optimal - reward
        return reward, regret, choice
    def decision(self):
        choice = 0
        arms = self.getArmsSize()
        best = self.rand.beta(self.ab[0][0] + 1, self.ab[0][1] + 1)
        for i in range(1, arms):
            temp = self.rand.beta(self.ab[i][0] + 1, self.ab[i][1] + 1)
            if temp > best:
                best = temp
                choice = i
        # Update the running average of the chosen arm.
        v = self.getValueOfArm(choice)
        pull = self.avergeRewards[choice][0]
        average = (self.avergeRewards[choice][1] * pull + v) / (pull + 1)
        self.avergeRewards[choice] = (pull + 1, average)
        # Heuristic beta update: a reward at or above the mean of all arm
        # averages counts as a success (a), below it as a failure (b).
        total = 0
        for i in range(arms):
            total += self.avergeRewards[i][1]
        measure = total / arms
        a, b = self.ab[choice]
        if v >= measure:
            a = a + 1
        else:
            b = b + 1
        self.ab[choice] = (a, b)
        return choice
totalExperiment = 2000
# Expected reward of a Rayleigh arm: sigma*math.sqrt(math.pi/2)
sigma1 = [2]
id = 1
arms = []
eBandit = EpsilonBandit(12373, 0.2)
ucbBandit = UCBBandit(0.8, 10)
tomBandit = Thompson(12323)
arm = RayleighArm(id, 12323, sigma1)
id = id + 1
eBandit.RegisterArm(arm)
ucbBandit.RegisterArm(arm)
tomBandit.RegisterArm(arm, 50, 50)
arms.append(arm)
sigma2 = [5]
arm = RayleighArm(id, 12446, sigma2)
id = id + 1
eBandit.RegisterArm(arm)
ucbBandit.RegisterArm(arm)
tomBandit.RegisterArm(arm, 50, 50)
arms.append(arm)
eRegret = 0.0
uRegret = 0.0
tomRegret = 0.0
f_epsilon = open("epsilon.txt", 'w')
f_ucb = open("ucb.txt", 'w')
f_tom = open("tom.txt", 'w')
for slot in range(totalExperiment):
    for j in range(len(arms)):
        arms[j].Update(slot)
    # Columns: slot, reward, cumulative regret, pulled arm, arm value.
    reward, regret, pull = eBandit.getRewardAndRegret()
    eRegret += regret
    f_epsilon.write(str(slot) + "\t" + str(reward) + "\t" + str(eRegret) + "\t"
                    + str(pull) + "\t" + str(eBandit.getValueOfArm(pull)) + "\n")
    reward, regret, pull = ucbBandit.getRewardAndRegret()
    uRegret += regret
    f_ucb.write(str(slot) + "\t" + str(reward) + "\t" + str(uRegret) + "\t"
                + str(pull) + "\t" + str(ucbBandit.getValueOfArm(pull)) + "\n")
    reward, regret, pull = tomBandit.getRewardAndRegret()
    tomRegret += regret
    f_tom.write(str(slot) + "\t" + str(reward) + "\t" + str(tomRegret) + "\t"
                + str(pull) + "\t" + str(tomBandit.getValueOfArm(pull)) + "\n")
f_epsilon.close()
f_ucb.close()
f_tom.close()
gnuplot plotting script:
#! /bin/sh
file1=epsilon.txt
file2=ucb.txt
file3=ucb_old.txt # regret trace from a UCB run that logs the plain running average
file4=tom.txt
gnuplot<<!
set grid
set xlabel "index"
set ylabel "regret"
set xrange [0:2000]
set yrange [0:1000]
set term "png"
set output "compare.png"
plot "${file1}" u 1:3 title "epsilon" with lines lw 2,\
"${file2}" u 1:3 title "UCB-smooth" with lines lw 2,\
"${file3}" u 1:3 title "UCB-average" with lines lw 2,\
"${file4}" u 1:3 title "tom" with lines lw 2
set output
exit
!
Experimental results:
UCB-average tracks each arm with the plain running mean; UCB-smooth uses the exponential filter instead:
# average = (self.avergeRewards[choice][1] * pull + v) / (pull + 1)
# self.avergeRewards[choice] = (pull + 1, average)
smooth = self.avergeRewards[choice][1] * (1 - self.alpha) + self.alpha * v
self.avergeRewards[choice] = (pull + 1, smooth)
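The motivation for the filter: a running average weights the whole history equally, while the exponential filter forgets old samples at rate (1 - alpha), so it tracks a drifting reward expectation much faster. A standalone illustration (not part of mab.py), using alpha = 0.8 as in the experiment:

# Mean of the signal jumps from 2 to 5 halfway through; compare trackers.
alpha = 0.8
avg = 0.0
smooth = 0.0
for n in range(1, 201):
    v = 2.0 if n <= 100 else 5.0
    avg = (avg * (n - 1) + v) / n              # running average
    smooth = smooth * (1 - alpha) + alpha * v  # exponential filter
print(round(avg, 3), round(smooth, 3))  # 3.5 vs 5.0: the filter has caught up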
In the experiments above, each arm's reward had a fixed expectation. Now randomize the expectation: each arm re-draws its sigma from a candidate list every 100 updates. From the test results in the figure below, the UCB algorithm shows a clear advantage.
sigma1=[2,4,6,8]
sigma2=[4,1,3,7]
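Since a Rayleigh arm's expected reward is sigma*sqrt(pi/2), each arm's mean now jumps among several levels and the two arms' ranges overlap; a quick standalone check of the levels implied by these lists:

import math

for name, sigmas in (("arm 1", [2, 4, 6, 8]), ("arm 2", [4, 1, 3, 7])):
    means = [round(s * math.sqrt(math.pi / 2), 2) for s in sigmas]
    print(name, means)
# arm 1 [2.51, 5.01, 7.52, 10.03]
# arm 2 [5.01, 1.25, 3.76, 8.77]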
[1] Python code for the multi-arm-bandits problem
[2] An intuitive explanation of what a bandit slot machine actually is
[3] The Multi-Armed Bandit Problem and Its Solutions
[4] Supplementary notes on the Multi-Armed Bandit
[5] Implementing the Rayleigh distribution in C++
[6] Multi Armed Bandits and Exploration Strategies
[7] Boltzmann Exploration Done Right