Sharing code for a multi-armed bandit model

This post shares an implementation of a multi-armed bandit. The main algorithms implemented are ϵ-greedy, UCB, and Thompson Sampling. UCB selects an arm according to:
a=\arg\max_a I_a,\qquad I_a=\overline r_a+\sigma_a\times\sqrt{\frac{2\times\log(T)}{N(a)}}
Here σ_a is a tuning factor; in the code it is taken as the maximum reward observed within a sliding window. This is because the rewards produced by my simulated arms are not binary (0 or 1) but random numbers drawn from a Rayleigh distribution [5]: the code uses inverse-transform sampling, so with u uniform on (0,1), x = σ·sqrt(-2·ln u) is Rayleigh-distributed with scale σ. For the theory behind this σ, see [4].
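
A minimal sketch of how this index can be evaluated, assuming per-arm running means, pull counts, and window maxima are tracked elsewhere (the names ucb_index, avg_reward, pull_count and window_max are illustrative, not taken from mab.py):

import math

def ucb_index(avg_reward, pull_count, window_max, T):
    # I_a = mean reward + sigma_a * sqrt(2*log(T)/N(a)),
    # with sigma_a taken as the largest reward seen in the arm's recent window
    return avg_reward + window_max * math.sqrt(2.0 * math.log(T) / pull_count)

# pull the arm with the largest index, e.g.:
# choice = max(range(num_arms), key=lambda a: ucb_index(avg[a], count[a], wmax[a], T))
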
For Thompson Sampling, each arm keeps two Beta-distribution parameters (a, b). Only when the obtained reward falls below the mean of the arms' average rewards is b incremented by 1; otherwise a is incremented. This is a heuristic of my own, with no real theory behind it (it simply binarizes the continuous reward by comparing it against the mean reward across all arms, so the usual Bernoulli-style Beta update can be applied):

        # mean of the per-arm average rewards, used as the success threshold
        sum=0
        for i in range(arms):
            sum+=self.avergeRewards[i][1]
        measure=sum/arms
        a=self.ab[choice][0]
        b=self.ab[choice][1]
        if v>=measure:
            # reward at or above the cross-arm mean counts as a success
            a=a+1
            self.ab[choice]=(a,b)
        else:
            # otherwise count a failure
            b=b+1
            self.ab[choice]=(a,b)
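
For completeness, the selection step that consumes these (a, b) pairs (see decision() in the Thompson class in the full listing below) draws one sample from each arm's Beta posterior and pulls the arm with the largest draw. A minimal sketch with illustrative names:

import numpy as np

rng = np.random.RandomState(12323)
ab = [(50, 50), (50, 50)]          # per-arm (a, b), as registered in mab.py
# one Beta draw per arm; the +1 offsets keep both parameters strictly positive
samples = [rng.beta(a + 1, b + 1) for (a, b) in ab]
choice = int(np.argmax(samples))   # index of the arm to pull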

Code: mab.py

import os
import random
import math
import numpy as np
class Arm(object):
    def getReward(self):
        return 0
    def getID(self):
        return 0
    def Update(self,slot):
        pass
class RayleighArm(Arm):
    def __init__(self,id,seed,sigmas):
        self.rand=random.Random()
        self.rand.seed(seed)
        self.sigmarand=random.Random()
        self.sigmarand.seed(seed)
        self.id=id
        self.sigmas=[]
        for i in range(len(sigmas)):
            self.sigmas.append(sigmas[i])
        self.update_counter=0
        i=self.sigmarand.randint(0,3000)%len(self.sigmas)
        self.sigma=self.sigmas[i]
        self.slot=0
        self.value=0
        self.generateV()
    def generateV(self):
        # inverse-transform sampling of a Rayleigh variable with scale sigma;
        # 1-uniform(0,1) lies in (0,1], which avoids log(0)
        r=1.0-self.rand.uniform(0,1)
        self.value=self.sigma*math.sqrt(-2 * math.log(r))
        self.update_counter+=1
        # once more than 100 values have been generated, redraw sigma from the
        # configured list so the arm's expected reward can change over time
        if self.update_counter>100:
            self.update_counter=0
            i=self.sigmarand.randint(0,3000)%len(self.sigmas)
            self.sigma=self.sigmas[i]
    def getReward(self):
        return self.value
    def getID(self):
        return self.id
    def Update(self,slot):
        if slot>self.slot:
            self.generateV()
            self.slot=slot
class Bandit(object):
    def __init__(self):
        self.arms=[]
    def RegisterArm(self,arm):
        items=len(self.arms)
        if items:
            exist=False
            for i in range(items):
                if arm.getID()==self.arms[i].getID():
                    exist=True
                    break
            if not exist:
                self.arms.append(arm)
        else:
            self.arms.append(arm)
    def getArmsSize(self):
        return len(self.arms)
    def getValueOfArm(self,i):
        return self.arms[i].getReward()
    def getOptimal(self):
        items=len(self.arms)
        max=self.arms[0].getReward()
        for i in range(items-1):
            v=self.arms[i+1].getReward()
            if(v>max):
                max=v
        return max
    def getRewardAndRegret(self):
        return 0,0,0
class EpsilonBandit(Bandit):
    def __init__(self,seed,epsilon):
        Bandit.__init__(self)
        self.rand=random.Random()
        self.randint=random.Random()
        self.rand.seed(seed)
        self.randint.seed(seed)
        self.epsilon=epsilon
        self.avergeRewards=[]
    def RegisterArm(self,arm):
        Bandit.RegisterArm(self,arm)
    def getValueOfArm(self,i):
        return Bandit.getValueOfArm(self,i)
    def getRewardAndRegret(self):
        reward=0
        regret=0
        arms=self.getArmsSize()
        optimal=self.getOptimal()
        pull=0
        if not len(self.avergeRewards):
            reward=optimal
            largest=self.getValueOfArm(0)
            for i in range(arms):
                v=self.getValueOfArm(i)
                self.avergeRewards.append((1,v))
                if v>largest:
                    largest=v
                    pull=i
        else:
            pull=self.decision()
            reward=self.getValueOfArm(pull)
            regret=optimal-reward
        return reward,regret,pull
    def decision(self):
        e=self.rand.uniform(0,1)
        arms=self.getArmsSize()
        choice=0
        if e<self.epsilon and arms>1:
            choice=self.randint.randint(0,1000)%arms
        else:
            largest=self.avergeRewards[0][1]
            for i in range(arms-1):
                if self.avergeRewards[i+1][1]>largest:
                    choice=i+1
                    largest=self.avergeRewards[i+1][1]
        v=self.getValueOfArm(choice)
        pull=self.avergeRewards[choice][0]
        average=(self.avergeRewards[choice][1]*pull+v)/(pull+1)
        self.avergeRewards[choice]=(pull+1,average)
        return choice
class UCBBandit(Bandit):
    def __init__(self,alpha,window):
        Bandit.__init__(self)
        self.alpha=alpha
        self.avergeRewards=[]
        self.T=1
        self.window=window
        self.windowRecord=[]
    def getWindowMaxOfArm(self,arm):
        # largest reward observed for this arm within the sliding window;
        # used as the sigma_a tuning factor in the UCB index
        items=len(self.windowRecord[arm])
        max=0
        if items:
            max=self.windowRecord[arm][0]
        for i in range(items):
            if self.windowRecord[arm][i]>max:
                max=self.windowRecord[arm][i]
        return max
    def UpdateWindowSample(self,arm,sample):
        # append the new sample and keep only the most recent `window` samples
        self.windowRecord[arm].append(sample)
        if len(self.windowRecord[arm])>self.window:
            self.windowRecord[arm]=self.windowRecord[arm][-self.window:]
    def RegisterArm(self,arm):
        Bandit.RegisterArm(self,arm)
    def getValueOfArm(self,i):
        return Bandit.getValueOfArm(self,i)
    def getRewardAndRegret(self):
        reward=0
        regret=0
        arms=self.getArmsSize()
        optimal=self.getOptimal()
        pull=0
        if not len(self.avergeRewards):
            reward=optimal
            largest=self.getValueOfArm(0)
            for i in range(arms):
                v=self.getValueOfArm(i)
                a=[]
                a.append(v)
                self.windowRecord.append(a)
                self.avergeRewards.append((1,v))
                if v>largest:
                    largest=v
                    pull=i
        else:
            pull=self.decision()
            reward=self.getValueOfArm(pull)
            regret=optimal-reward
            self.UpdateWindowSample(pull,reward)
        self.T+=1
        return reward,regret,pull
    def decision(self):
        arms=self.getArmsSize()
        choice=0
        measure=[]
        for i in range(arms):
            N=self.avergeRewards[i][0]
            # sigma_a: maximum reward seen in this arm's recent window
            tun=self.getWindowMaxOfArm(i)
            # UCB index: I_a = mean reward + sigma_a*sqrt(2*log(T)/N(a))
            m=self.avergeRewards[i][1]+tun*math.sqrt(2*math.log(self.T)/N)
            measure.append(m)
        max_m=measure[0]
        for i in range(arms-1):
            if measure[i+1]>max_m:
                max_m=measure[i+1]
                choice=i+1
        v=self.getValueOfArm(choice)
        pull=self.avergeRewards[choice][0]
        #average=(self.avergeRewards[choice][1]*pull+v)/(pull+1)
        #self.avergeRewards[choice]=(pull+1,average)
        # UCB-smooth: exponential filtering of the reward estimate; the two
        # commented lines above give the plain running average (UCB-average)
        smooth=self.avergeRewards[choice][1]*(1-self.alpha)+(self.alpha)*v
        self.avergeRewards[choice]=(pull+1,smooth)
        return choice
#https://visualstudiomagazine.com/articles/2019/06/01/thompson-sampling.aspx
class Thompson(Bandit):
    def __init__(self,seed):
        Bandit.__init__(self)
        self.rand=np.random.RandomState(seed)
        self.ab=[]
        self.avergeRewards=[]
    def RegisterArm(self,arm,a,b):
        Bandit.RegisterArm(self,arm)
        self.ab.append((a,b))
    def getValueOfArm(self,i):
        return Bandit.getValueOfArm(self,i)
    def getRewardAndRegret(self):
        reward=0
        regret=0
        arms=self.getArmsSize()
        optimal=self.getOptimal()
        choice=0
        if not len(self.avergeRewards):
            reward=optimal
            largest=self.getValueOfArm(0)
            for i in range(arms):
                v=self.getValueOfArm(i)
                self.avergeRewards.append((1,v))
                if v>largest:
                    largest=v
                    choice=i
        else:
            choice=self.decision()
            reward=self.getValueOfArm(choice)
            regret=optimal-reward
        return reward,regret,choice
    def decision(self):
        choice=0
        arms=self.getArmsSize()
        max=self.rand.beta(self.ab[choice][0]+1,self.ab[choice][1]+1)
        for i in range(arms-1):
            temp=self.rand.beta(self.ab[i+1][0]+1,self.ab[i+1][1]+1)
            if temp>max:
                max=temp
                choice=i+1
        v=self.getValueOfArm(choice)
        pull=self.avergeRewards[choice][0]
        average=(self.avergeRewards[choice][1]*pull+v)/(pull+1)
        self.avergeRewards[choice]=(pull+1,average)
        # mean of the per-arm average rewards, used as the success threshold
        sum=0
        for i in range(arms):
            sum+=self.avergeRewards[i][1]
        measure=sum/arms
        a=self.ab[choice][0]
        b=self.ab[choice][1]
        if v>=measure:
            a=a+1
            self.ab[choice]=(a,b)
        else:
            b=b+1
            self.ab[choice]=(a,b)
        return choice
totalExperiment=2000
#average reward of a Rayleigh arm: sigma*math.sqrt(math.pi/2)
sigma1=[2]
id=1
arms=[]
eBandit=EpsilonBandit(12373,0.2)
ucbBandit=UCBBandit(0.8,10)
tomBandit=Thompson(12323)
arm=RayleighArm(id,12323,sigma1)
id=id+1
eBandit.RegisterArm(arm)
ucbBandit.RegisterArm(arm)
tomBandit.RegisterArm(arm,50,50)
arms.append(arm)

sigma2=[5]
arm=RayleighArm(id,12446,sigma2)
id=id+1

eBandit.RegisterArm(arm)
ucbBandit.RegisterArm(arm)
tomBandit.RegisterArm(arm,50,50)
arms.append(arm)

eRegret=0.0
uRegret=0.0
tomRegret=0.0
f_epsilon=open("epsilon.txt",'w')
f_ucb=open("ucb.txt",'w')
f_tom=open("tom.txt",'w')
for slot in range(totalExperiment):
    for j in range(len(arms)):
        arms[j].Update(slot)
    reward,regret,pull=eBandit.getRewardAndRegret()
    eRegret+=regret
    f_epsilon.write(str(slot)+"\t"+str(reward)+"\t"+str(eRegret)+"\t"+str(pull)
    +"\t"+str(eBandit.getValueOfArm(pull))+"\n")
    reward,regret,pull=ucbBandit.getRewardAndRegret()
    uRegret+=regret
    f_ucb.write(str(slot)+"\t"+str(reward)+"\t"+str(uRegret)+"\t"+str(pull)
    +"\t"+str(ucbBandit.getValueOfArm(pull))+"\n")
    reward,regret,pull=tomBandit.getRewardAndRegret()
    tomRegret+=regret
    f_tom.write(str(slot)+"\t"+str(reward)+"\t"+str(tomRegret)+"\t"+str(pull)
    +"\t"+str(tomBandit.getValueOfArm(pull))+"\n")    
f_epsilon.close()
f_ucb.close()
f_tom.close()

gnuplot plotting script:

#! /bin/sh
file1=epsilon.txt
file2=ucb.txt
file3=ucb_old.txt # saved from a run using the running-average update (UCB-average)
file4=tom.txt
gnuplot<<!
set grid
set xlabel "index" 
set ylabel "regret"
set xrange [0:2000]
set yrange [0:1000]
set term "png"
set output "compare.png"
plot "${file1}" u 1:3 title "epsilon" with lines lw 2,\
"${file2}" u 1:3 title "UCB-smooth" with lines lw 2,\
"${file3}" u 1:3 title "UCB-average" with lines lw 2,\
"${file4}" u 1:3 title "tom" with lines lw 2
set output
exit
!

Experiment results:
[Figure: compare.png, cumulative regret of epsilon-greedy, UCB-smooth, UCB-average, and Thompson Sampling]
For UCB-average, the mean reward \overline r is the plain running average; for UCB-smooth, \overline r is an exponentially filtered estimate:

        # UCB-average: plain running mean (commented out)
        #average=(self.avergeRewards[choice][1]*pull+v)/(pull+1)
        #self.avergeRewards[choice]=(pull+1,average)
        # UCB-smooth: exponential filtering with factor alpha
        smooth=self.avergeRewards[choice][1]*(1-self.alpha)+(self.alpha)*v
        self.avergeRewards[choice]=(pull+1,smooth)
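
Written out, the two update rules being compared are (v is the newest observed reward, N the number of pulls of the chosen arm so far, and \alpha the smoothing factor, 0.8 in this run):

\overline r_{\text{avg}} \leftarrow \frac{N\,\overline r_{\text{avg}} + v}{N+1},\qquad \overline r_{\text{smooth}} \leftarrow (1-\alpha)\,\overline r_{\text{smooth}} + \alpha\, v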

In the experiments above, each arm's expected reward is fixed. Now let the expectation vary: each RayleighArm is given a list of sigma values and redraws its sigma from that list roughly every 100 updates, so its expected reward changes over time. From the test results in the figure below, the UCB algorithm now shows a clear advantage.

sigma1=[2,4,6,8]
sigma2=[4,1,3,7]
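
Since the mean of a Rayleigh distribution with scale σ is σ·sqrt(π/2), each arm now cycles through a set of expected rewards rather than a single fixed value. A quick check of what those expectations are (this snippet is only for illustration and is not part of mab.py):

import math

for name, sigmas in (("arm 1", [2, 4, 6, 8]), ("arm 2", [4, 1, 3, 7])):
    means = [round(s * math.sqrt(math.pi / 2), 2) for s in sigmas]
    print(name, means)
# arm 1 [2.51, 5.01, 7.52, 10.03]
# arm 2 [5.01, 1.25, 3.76, 8.77]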

[Figure: regret comparison with time-varying expected rewards]

[1] Python code for the multi-arm-bandits problem
[2] Can someone give a plain-language explanation of what a bandit (slot machine) actually is?
[3] The Multi-Armed Bandit Problem and Its Solutions
[4] Supplementary notes on the Multi-Armed Bandit
[5] Implementing the Rayleigh distribution in C++
[6] Multi Armed Bandits and Exploration Strategies
[7] Boltzmann Exploration Done Right
