本博文講的是《Reinforcement Learning: An Introduction》(第二版)第二章中 multi-armed bandits 算法的 Python 代碼實現。整本書的代碼實現在 GitHub 上可以找到,也比較官方:https://github.com/ShangtongZhang/reinforcement-learning-an-introduction 。當我第一次看到這份代碼的時候,感覺讀起來有點晦澀:雖然整個代碼看起來很工整,但是仔細分析一下邏輯關係,會發現代碼之間的耦合太多了。當然,我這裏只是說第二章的代碼,其他章節的代碼還沒看過。該鏈接中提供的第二章的代碼如下:
#######################################################################
# Copyright (C) #
# 2016 Shangtong Zhang([email protected]) #
# 2016 Tian Jun([email protected]) #
# 2016 Artem Oboturov([email protected]) #
# 2016 Kenta Shimada([email protected]) #
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
class Bandit:
    """A k-armed bandit problem bundled with the agent that learns on it.

    Constructor arguments:
        kArm: number of arms.
        epsilon: probability of exploring in the epsilon-greedy rule.
        initial: initial estimate assigned to every action.
        stepSize: constant step size used to update the estimates.
        sampleAverages: if True, update estimates with sample averages
            instead of the constant step size.
        UCBParam: if not None, select actions with the UCB rule using this
            value as the exploration coefficient.
        gradient: if True, use the gradient bandit algorithm; ``qEst`` then
            holds the action preferences fed to the soft-max.
        gradientBaseline: if True, the gradient algorithm uses the running
            average reward as its baseline (otherwise the baseline is 0).
        trueReward: offset added to every arm's true value.
    """

    def __init__(self, kArm=10, epsilon=0., initial=0., stepSize=0.1, sampleAverages=False, UCBParam=None,
                 gradient=False, gradientBaseline=False, trueReward=0.):
        self.k = kArm
        self.stepSize = stepSize
        self.sampleAverages = sampleAverages
        self.indices = np.arange(self.k)
        self.time = 0
        self.UCBParam = UCBParam
        self.gradient = gradient
        self.gradientBaseline = gradientBaseline
        self.averageReward = 0
        self.trueReward = trueReward
        self.epsilon = epsilon

        # real reward for each action, drawn from N(trueReward, 1)
        self.qTrue = [np.random.randn() + trueReward for _ in range(self.k)]
        # estimation for each action, all starting at the desired initial value
        self.qEst = np.full(self.k, initial, dtype=float)
        # number of times each action has been chosen
        self.actionCount = [0] * self.k

        self.bestAction = np.argmax(self.qTrue)

        # All estimates start out equal, so the soft-max over them is uniform.
        # Setting it here keeps takeAction() usable in gradient mode even if
        # no soft-max selection has happened yet (for instance when the very
        # first getAction() call took the epsilon exploration branch).
        self.actionProb = np.full(self.k, 1.0 / self.k)

    def getAction(self):
        """Return the index of the next action (explore or exploit)."""
        # explore: only draw the coin when exploration is enabled at all
        if self.epsilon > 0 and np.random.binomial(1, self.epsilon) == 1:
            return np.random.choice(self.indices)

        # exploit
        if self.UCBParam is not None:
            # the +1 terms avoid log(0) and division by zero on untried arms
            bonus = np.sqrt(np.log(self.time + 1) / (np.asarray(self.actionCount) + 1))
            return np.argmax(self.qEst + self.UCBParam * bonus)

        if self.gradient:
            # Subtracting the maximum leaves the soft-max unchanged
            # mathematically but stops np.exp from overflowing to inf,
            # which would turn the probabilities into NaN.
            expEst = np.exp(self.qEst - np.max(self.qEst))
            self.actionProb = expEst / np.sum(expEst)
            return np.random.choice(self.indices, p=self.actionProb)

        return np.argmax(self.qEst)

    def takeAction(self, action):
        """Pull arm ``action``, update the estimates and return the reward."""
        # generate the reward under N(real reward, 1)
        reward = np.random.randn() + self.qTrue[action]
        self.time += 1
        # incremental mean of all rewards seen so far, including this one
        self.averageReward = (self.time - 1.0) / self.time * self.averageReward + reward / self.time
        self.actionCount[action] += 1

        if self.sampleAverages:
            # update estimation using sample averages
            self.qEst[action] += 1.0 / self.actionCount[action] * (reward - self.qEst[action])
        elif self.gradient:
            oneHot = np.zeros(self.k)
            oneHot[action] = 1
            baseline = self.averageReward if self.gradientBaseline else 0
            # uses the probabilities stored by the latest soft-max selection
            self.qEst = self.qEst + self.stepSize * (reward - baseline) * (oneHot - self.actionProb)
        else:
            # update estimation with constant step size
            self.qEst[action] += self.stepSize * (reward - self.qEst[action])
        return reward
# Module-level counter: each plotting function below passes it to plt.figure()
# as the figure number and then increments it.
figureIndex = 0
# for figure 2.1
def figure2_1():
    """Draw a violin plot of sampled reward distributions for 10 actions.

    Opens a new figure numbered by the module-level ``figureIndex`` and
    advances that counter.
    """
    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1

    # 200 unit-variance samples per action, each column shifted by its own
    # randomly drawn offset (drawn after the noise matrix)
    noise = np.random.randn(200, 10)
    offsets = np.random.randn(10)
    sns.violinplot(data=noise + offsets)

    plt.xlabel("Action")
    plt.ylabel("Reward distribution")
def banditSimulation(nBandits, time, bandits):
    """Run every bandit in every group for ``time`` steps and average.

    ``bandits`` is a list of groups; the first ``nBandits`` members of each
    group are run. Members must provide ``getAction()``,
    ``takeAction(action)`` and a ``bestAction`` attribute.

    Returns ``(bestActionCounts, averageRewards)``: two lists with one
    length-``time`` array per group, holding per step the fraction of runs
    that picked the best action and the mean reward.
    """
    groupCount = len(bandits)
    bestActionCounts = [np.zeros(time, dtype='float') for _ in range(groupCount)]
    averageRewards = [np.zeros(time, dtype='float') for _ in range(groupCount)]

    for groupIdx, group in enumerate(bandits):
        optimalHits = bestActionCounts[groupIdx]
        rewardSums = averageRewards[groupIdx]
        for run in range(nBandits):
            learner = group[run]
            for step in range(time):
                chosen = learner.getAction()
                payoff = learner.takeAction(chosen)
                rewardSums[step] += payoff
                if chosen == learner.bestAction:
                    optimalHits[step] += 1
        # turn the per-step sums into per-step averages, in place
        optimalHits /= nBandits
        rewardSums /= nBandits

    return bestActionCounts, averageRewards
# for figure 2.2
def epsilonGreedy(nBandits, time):
    """Compare sample-average epsilon-greedy agents for three epsilons.

    Builds ``nBandits`` bandits per epsilon, simulates ``time`` steps and
    draws two figures: optimal-action fraction and average reward.
    """
    global figureIndex
    epsilons = [0, 0.1, 0.01]
    bandits = [[Bandit(epsilon=eps, sampleAverages=True) for _ in range(nBandits)]
               for eps in epsilons]
    bestActionCounts, averageRewards = banditSimulation(nBandits, time, bandits)

    # one figure per metric, one curve per epsilon
    figures = ((bestActionCounts, '% optimal action'),
               (averageRewards, 'average reward'))
    for curves, yLabel in figures:
        plt.figure(figureIndex)
        figureIndex += 1
        for eps, curve in zip(epsilons, curves):
            plt.plot(curve, label='epsilon = '+str(eps))
        plt.xlabel('Steps')
        plt.ylabel(yLabel)
        plt.legend()
# for figure 2.3
def optimisticInitialValues(nBandits, time):
    """Compare greedy with initial estimate 5 against epsilon-greedy from 0.

    Both variants use a constant step size of 0.1; the optimal-action
    fraction of each is plotted in a new figure.
    """
    global figureIndex
    optimistic = [Bandit(epsilon=0, initial=5, stepSize=0.1) for _ in range(nBandits)]
    realistic = [Bandit(epsilon=0.1, initial=0, stepSize=0.1) for _ in range(nBandits)]
    bestActionCounts, _ = banditSimulation(nBandits, time, [optimistic, realistic])

    plt.figure(figureIndex)
    figureIndex += 1
    curveLabels = ('epsilon = 0, q = 5', 'epsilon = 0.1, q = 0')
    for curve, curveLabel in zip(bestActionCounts, curveLabels):
        plt.plot(curve, label=curveLabel)
    plt.xlabel('Steps')
    plt.ylabel('% optimal action')
    plt.legend()
# for figure 2.4
def ucb(nBandits, time):
    """Compare UCB (coefficient 2) against epsilon-greedy (epsilon 0.1).

    Both variants use a constant step size of 0.1; the average reward of
    each is plotted in a new figure.
    """
    global figureIndex
    ucbGroup = [Bandit(epsilon=0, stepSize=0.1, UCBParam=2) for _ in range(nBandits)]
    greedyGroup = [Bandit(epsilon=0.1, stepSize=0.1) for _ in range(nBandits)]
    _, averageRewards = banditSimulation(nBandits, time, [ucbGroup, greedyGroup])

    plt.figure(figureIndex)
    figureIndex += 1
    curveLabels = ('UCB c = 2', 'epsilon greedy epsilon = 0.1')
    for curve, curveLabel in zip(averageRewards, curveLabels):
        plt.plot(curve, label=curveLabel)
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()
# for figure 2.5
def gradientBandit(nBandits, time):
    """Compare gradient bandits for two step sizes, with/without baseline.

    Every bandit is created with ``trueReward=4``; the optimal-action
    fraction of the four variants is plotted in a new figure.
    """
    global figureIndex
    # (step size, use baseline) - listed in the same order as `labels`
    settings = [(0.1, True), (0.1, False), (0.4, True), (0.4, False)]
    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline']
    bandits = [[Bandit(gradient=True, stepSize=alpha, gradientBaseline=useBaseline, trueReward=4)
                for _ in range(nBandits)]
               for alpha, useBaseline in settings]
    bestActionCounts, _ = banditSimulation(nBandits, time, bandits)

    plt.figure(figureIndex)
    figureIndex += 1
    for curve, curveLabel in zip(bestActionCounts, labels):
        plt.plot(curve, label=curveLabel)
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()
# Figure 2.6
def figure2_6(nBandits, time):
    """Parameter study: mean reward of four algorithms over a sweep.

    For each algorithm a range of exponents x is swept and the algorithm's
    parameter is set to 2**x. Each setting is simulated on ``nBandits``
    bandits for ``time`` steps, and the reward averaged over all steps is
    plotted against x in a new figure.
    """
    global figureIndex
    labels = ['epsilon-greedy', 'gradient bandit',
              'UCB', 'optimistic initialization']
    generators = [lambda epsilon: Bandit(epsilon=epsilon, sampleAverages=True),
                  lambda alpha: Bandit(gradient=True, stepSize=alpha, gradientBaseline=True),
                  lambda coef: Bandit(epsilon=0, stepSize=0.1, UCBParam=coef),
                  lambda initial: Bandit(epsilon=0, initial=initial, stepSize=0.1)]
    # The builtin float replaces the np.float alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; both denote the same float64 dtype.
    parameters = [np.arange(-7, -1, dtype=float),
                  np.arange(-5, 2, dtype=float),
                  np.arange(-4, 3, dtype=float),
                  np.arange(-2, 3, dtype=float)]

    # one group of nBandits bandits per (algorithm, exponent) pair, flattened
    # in algorithm order so the result can be sliced per algorithm below
    bandits = [[generator(pow(2, param)) for _ in range(0, nBandits)]
               for generator, parameter in zip(generators, parameters)
               for param in parameter]
    _, averageRewards = banditSimulation(nBandits, time, bandits)
    rewards = np.sum(averageRewards, axis=1)/time

    plt.figure(figureIndex)
    figureIndex += 1
    start = 0
    for label, parameter in zip(labels, parameters):
        stop = start + len(parameter)
        plt.plot(parameter, rewards[start:stop], label=label)
        start = stop
    plt.xlabel('Parameter(2^x)')
    plt.ylabel('Average reward')
    plt.legend()
# Draw every figure, then display them all at once.
figure2_1()
# Each remaining experiment uses 2000 bandits and 1000 time steps.
# figure2_6 is last in the list; this will take somehow a long time.
for runExperiment in (epsilonGreedy, optimisticInitialValues, ucb, gradientBandit, figure2_6):
    runExperiment(2000, 1000)
plt.show()
在我個人看來,強化學習強調的是智能體在未知環境下完成設定的任務,根據環境中反饋的激勵信號來調整自身的動作策略。因此強化學習是一個環境與智能體交互,智能體根據交互信息調節自身動作策略的過程。在強化學習的代碼中,就仿真代碼而言,應該把環境對象和智能體對象分離開來,並利用面向過程的編程範式編寫函數,將兩者橋接起來。上述的代碼把兩者糅合在一起了,讓人一時間很難看懂。
下面我將根據上述代碼的思路,重新根據面向對象的編程範式,編寫新的代碼:
①爲環境創建一個類——Bandits
②爲智能體創建一個類——Agent
③交互的過程是(看simulation函數): 智能體選擇並執行動作(chooseAction),環境根據動作產生激勵信號(yieldReward),智能體根據反饋信號調整動作(updatePolicy)。上述不斷循環,從而使得智能體的策略不斷根據激勵信號被優化。
智能體應該具有的一些成員函數和成員變量爲:
①與環境相關的變量,可採取的動作類型(如賭博機的臂數)
②選擇動作函數,更新策略函數,保存策略函數和加載策略函數以及最後的智能體復位(回到最原始的狀態)
環境應該具有的成員函數和成員變量爲:
①根據問題需要用到的參數,每個動作的值
②最佳動作(如果是仿真環境下的話,可用於評估設計的算法的優劣),環境變化(能夠在一定範圍內變化),產生激勵信號(對智能體的動作有一定的反饋)
調整後的代碼如下所示:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#######################################################################
# Copyright (C) #
# 2018 Dianye Huang ([email protected])
# Permission given to modify the code as long as you keep this #
# declaration at the top #
#######################################################################
# 該程序參考github上的強化學習課程程序重新編寫,邏輯更加清晰。編寫環境和智能體兩個類進行交互,將環境的信息
# 與智能體的信息隔離,使智能體成爲單一的獨立個體,並能夠對智能體強化訓練得到的參數復位,重新進行訓練,以檢
# 測算法的穩定性。
# stationary problem , 動作的價值是固定的,沒有變化,而反饋給智能體的價值是夾雜了噪音的,需要通過多次行爲來確定。
# bandit problem 是與當前狀態沒有關係的單步決策問題,不需要考慮上一時刻狀態。動作作用於環境,然後直接獲得reward
# non-associative tasks: 動作之間的連續性和上下文關聯不大。
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Environment: receives an action and feeds back a noisy reward for it.
class Bandit:
    """A k-armed bandit environment with Gaussian rewards.

    Args:
        kArm: number of arms.
        trueReward: offset added to every arm's randomly drawn true value.
    """

    def __init__(self, kArm=10, trueReward = 0):
        self.kArm = kArm
        self.trueReward = trueReward  # common offset of the true action values
        self.actionCount = 0          # kept for compatibility; not updated by this class
        self.time = 0                 # number of rewards handed out so far
        # true value of every arm; filled in by change()
        self.qTrue = []
        self.change()

    def change(self):
        """Redraw the true value of every arm from N(trueReward, 1)."""
        self.qTrue = [np.random.randn() + self.trueReward for _ in range(self.kArm)]

    def yieldReward(self, action):
        """Return a reward for ``action``: its true value plus N(0, 1) noise."""
        self.time += 1
        return np.random.randn() + self.qTrue[action]

    def getBestAction(self):
        """Return the index of the arm with the highest true value."""
        return int(np.argmax(self.qTrue))

    def getkArms(self):
        """Return the number of arms."""
        return self.kArm

    def getTime(self):
        """Return how many rewards have been handed out."""
        return self.time

    def _sampleRewards(self, nSamples=200):
        """Return an (nSamples, kArm) array of noisy rewards, one column per arm."""
        # The column count follows kArm; a fixed width of 10 cannot be
        # broadcast against qTrue when the environment has another arm count.
        return np.asarray(self.qTrue) + np.random.randn(nSamples, self.kArm)

    def showqTrue(self, figureIndex):
        """Print the true values and draw a violin plot of sampled rewards."""
        plt.figure(figureIndex)
        print('qTrue:', self.qTrue)
        sns.violinplot(data=self._sampleRewards())
        plt.xlabel("Action")
        plt.ylabel("Reward distribution")
# Agent: it can (1) choose an action, (2) take in the reward returned by the
# environment, (3) adjust its policy and (4) reset what it has learned.
class Agent:
    """A bandit agent supporting several action-selection/update methods.

    Args:
        actionNum: number of available actions.
        method: one of 'SampleAverages', 'Incremental', 'OptimisticInitial',
            'UCB' or 'Gradient'.
        paraList: method parameters (required, despite the ``None`` default):
            'SampleAverages'    -> [epsilon]
            'Incremental'       -> [epsilon, step size]
            'OptimisticInitial' -> [epsilon, step size, optimistic value]
            'UCB'               -> [step size, c]
            'Gradient'          -> [alpha, use baseline (bool)]

    Raises:
        ValueError: if ``method`` is not one of the names above.
    """

    _EPSILON_GREEDY_METHODS = ('SampleAverages', 'Incremental', 'OptimisticInitial')
    _KNOWN_METHODS = _EPSILON_GREEDY_METHODS + ('UCB', 'Gradient')

    def __init__(self, actionNum=10, method='SampleAverages',paraList=None):
        # Fail early: an unrecognised method would otherwise yield an agent
        # without estimates whose chooseAction() returns None.
        if method not in self._KNOWN_METHODS:
            raise ValueError('unknown method %r; expected one of %s'
                             % (method, ', '.join(self._KNOWN_METHODS)))
        self.time = 0
        self.actionNum = actionNum
        self.currenAction = None
        self.method = method

        if method == 'SampleAverages':
            self.epsilon = paraList[0]
        elif method == 'Incremental':
            self.epsilon = paraList[0]
            self.stepSize = paraList[1]
        elif method == 'OptimisticInitial':
            self.epsilon = paraList[0]
            self.stepSize = paraList[1]
            self.optimisticValue = paraList[2]
        elif method == 'UCB':
            self.stepSize = paraList[0]
            self.c = paraList[1]  # controls the degree of exploration
        else:  # 'Gradient'
            self.sum = 0
            self.alpha = paraList[0]
            self.baseLine = paraList[1]
        self._initLearnedState()

    def _initLearnedState(self):
        """Create (or re-create) everything the agent learns, per method."""
        if self.method == 'SampleAverages':
            self.qEst = np.zeros(self.actionNum)  # estimated value of each action
            self.actionCount = np.zeros(self.actionNum)
        elif self.method == 'Incremental':
            self.qEst = np.zeros(self.actionNum)
        elif self.method == 'OptimisticInitial':
            self.qEst = np.zeros(self.actionNum) + self.optimisticValue
        elif self.method == 'UCB':
            self.qEst = np.zeros(self.actionNum)
            self.actionCount = np.zeros(self.actionNum)
        elif self.method == 'Gradient':
            self.averageReward = 0
            self.Ht = np.zeros(self.actionNum)  # action preferences
            self.actionProb = np.zeros(self.actionNum)

    def chooseAction(self):
        """Advance one time step, pick an action, remember and return it."""
        self.time += 1  # one action per time step
        if self.method in self._EPSILON_GREEDY_METHODS:
            # explore with probability epsilon
            if self.epsilon > 0 and np.random.binomial(1, self.epsilon) == 1:
                self.currenAction = np.random.choice(self.actionNum)
                return self.currenAction
            # otherwise exploit: greedy with respect to the estimates
            self.currenAction = int(np.argmax(self.qEst))
        elif self.method == 'UCB':
            # the +1 avoids division by zero for actions not tried yet
            explrProb = self.c*np.sqrt(np.log(self.time)/(self.actionCount+1))
            self.currenAction = int(np.argmax(self.qEst+explrProb))
        elif self.method == 'Gradient':
            # Soft-max over the preferences. Subtracting the maximum leaves
            # the probabilities unchanged mathematically but keeps np.exp
            # from overflowing to inf, which would make them NaN.
            expEst = np.exp(self.Ht - np.max(self.Ht))
            self.actionProb = expEst / np.sum(expEst)
            self.currenAction = np.random.choice(self.actionNum, p=self.actionProb)
        return self.currenAction

    def updatePolicy(self, reward):
        """Update the estimates/preferences for the action chosen last.

        Meant to be called after chooseAction(), which sets that action.
        """
        chosen = self.currenAction
        if self.method == 'SampleAverages':
            self.actionCount[chosen] += 1
            # incremental form of the sample average; no reward sum needed
            self.qEst[chosen] += 1.0/self.actionCount[chosen]*(reward - self.qEst[chosen])
        elif self.method == 'Incremental' or self.method == 'OptimisticInitial':
            # exponential recency-weighted average
            self.qEst[chosen] += self.stepSize*(reward-self.qEst[chosen])
        elif self.method == 'UCB':
            self.actionCount[chosen] += 1
            self.qEst[chosen] += self.stepSize * (reward - self.qEst[chosen])
        elif self.method == 'Gradient':
            oneHot = np.zeros(self.actionNum)
            oneHot[chosen] = 1
            if self.baseLine:
                # running mean of the rewards, used as the baseline
                self.averageReward += (reward - self.averageReward)/ float(self.time)
                self.Ht += self.alpha * (reward - self.averageReward) * (oneHot - self.actionProb)
            else:
                self.Ht += self.alpha * reward * (oneHot - self.actionProb)

    def reset(self):
        """Forget everything learned and restart the step counter."""
        self.time = 0
        self._initLearnedState()

    def savePolicy(self):
        """Placeholder; not implemented."""
        pass

    def loadPolicy(self):
        """Placeholder; not implemented."""
        pass
# Run the simulation and record data: this function connects the agent and
# the environment and logs the outcome of their interaction.
def simulation(env, player, nBandits, time):
    """Run ``nBandits`` independent runs of ``time`` steps and average them.

    Args:
        env: environment providing ``change()``, ``yieldReward(action)``
            and ``getBestAction()``.
        player: agent providing ``chooseAction()``, ``updatePolicy(reward)``
            and ``reset()``.
        nBandits: number of runs; the environment is changed before each
            run and the agent is reset after each run.
        time: number of time steps per run.

    Returns:
        ``(averageReward, bestCount)``: two length-``time`` arrays with, per
        time step, the mean reward and the fraction of runs in which the
        agent chose the environment's best action.
    """
    bestCount = np.zeros(time)      # how often the best action was chosen
    averageReward = np.zeros(time)  # accumulated reward per time step

    for _ in range(nBandits):
        env.change()  # draw a fresh problem for this run
        # The best action is asked for once per run instead of once per
        # step: the environment is only modified through change() here, so
        # the answer is assumed not to vary within a run.
        bestAction = env.getBestAction()
        for t in range(time):
            # interaction: choose, get rewarded, adapt
            action = player.chooseAction()
            reward = env.yieldReward(action)
            player.updatePolicy(reward)
            # bookkeeping for the performance curves
            averageReward[t] += reward
            if action == bestAction:
                bestCount[t] += 1
        # start the next run from an untrained agent
        player.reset()

    # sums -> averages over the runs
    averageReward /= nBandits
    bestCount /= nBandits
    return averageReward, bestCount
# Plotting helper. It is used for two kinds of curves:
# (1) the average reward and (2) the rate of choosing the optimal action.
class Plot:
    """Thin wrapper around the matplotlib calls needed for one curve."""

    def plotting(self, data, figureIndex, labelStr, xStr, yStr):
        """Add ``data`` as a labelled curve to figure number ``figureIndex``.

        ``xStr`` and ``yStr`` become the axis labels, and the legend is
        (re)drawn so the new curve's label shows up.
        """
        plt.figure(figureIndex)         # select (or create) the target figure
        plt.plot(data, label=labelStr)  # draw the curve with its legend entry
        plt.xlabel(xStr)
        plt.ylabel(yStr)
        plt.legend()
####################################################################################################
if __name__ == '__main__':
    # plotting helper
    plot = Plot()

    # environment: 10 arms, true values offset by 4
    env = Bandit(10, 4)
    env.showqTrue(figureIndex=1)  # show the reward distributions

    # One entry per agent: (alpha, use baseline, legend label).
    gradientRuns = [
        (0.1, False, 'Gradient (alpha=0.1, without baseline)'),
        (0.1, True, 'Gradient (alpha=0.1, with baseline)'),
        (0.4, False, 'Gradient (alpha=0.4, without baseline)'),
        (0.4, True, 'Gradient (alpha=0.4, with baseline)'),
    ]
    for alpha, useBaseline, curveLabel in gradientRuns:
        # build the agent, simulate 2000 runs of 1000 steps, plot both metrics
        player = Agent(env.kArm, method='Gradient', paraList=[alpha, useBaseline])
        avgReward, bestCount = simulation(env, player, 2000, 1000)
        plot.plotting(avgReward, 2, curveLabel, 'Steps', 'Average Reward')
        plot.plotting(bestCount, 3, curveLabel, 'Steps', 'Optimal Action')

    # display all figures
    plt.show()
輸出結果如下:
與書本上的圖2.5結果一致: