Programming Notes on Reinforcement Learning: An Introduction (Chapter 2)

This post is about the Python implementation of the multi-armed bandit algorithms in Chapter 2 of the second edition of Reinforcement Learning: An Introduction. A fairly official implementation of the whole book is available on GitHub: https://github.com/ShangtongZhang/reinforcement-learning-an-introduction. When I first read this code I found it a bit hard to follow: the code looks tidy, but a closer look at the logic shows that its parts are too tightly coupled. (I am only talking about the Chapter 2 code here; I have not read the other chapters.) The Chapter 2 code provided at that link is as follows:

#######################################################################
# Copyright (C)                                                       #
# 2016 Shangtong Zhang([email protected])                  #
# 2016 Tian Jun([email protected])                                #
# 2016 Artem Oboturov([email protected])                             #
# 2016 Kenta Shimada([email protected])                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

class Bandit:
    # @kArm: # of arms
    # @epsilon: probability for exploration in epsilon-greedy algorithm
    # @initial: initial estimation for each action
    # @stepSize: constant step size for updating estimations
    # @sampleAverages: if True, use sample averages to update estimations instead of constant step size
    # @UCB: if not None, use UCB algorithm to select action
    # @gradient: if True, use gradient based bandit algorithm
    # @gradientBaseline: if True, use average reward as baseline for gradient based bandit algorithm
    def __init__(self, kArm=10, epsilon=0., initial=0., stepSize=0.1, sampleAverages=False, UCBParam=None,
                 gradient=False, gradientBaseline=False, trueReward=0.):
        self.k = kArm
        self.stepSize = stepSize
        self.sampleAverages = sampleAverages
        self.indices = np.arange(self.k)
        self.time = 0
        self.UCBParam = UCBParam
        self.gradient = gradient
        self.gradientBaseline = gradientBaseline
        self.averageReward = 0
        self.trueReward = trueReward

        # real reward for each action
        self.qTrue = []

        # estimation for each action
        self.qEst = np.zeros(self.k)

        # # of chosen times for each action
        self.actionCount = []

        self.epsilon = epsilon

        # initialize real rewards with N(0,1) distribution and estimations with desired initial value
        for i in range(0, self.k):
            self.qTrue.append(np.random.randn() + trueReward)
            self.qEst[i] = initial
            self.actionCount.append(0)

        self.bestAction = np.argmax(self.qTrue)

    # get an action for this bandit, explore or exploit?
    def getAction(self):
        # explore
        if self.epsilon > 0:
            if np.random.binomial(1, self.epsilon) == 1:
                return np.random.choice(self.indices)

        # exploit
        if self.UCBParam is not None:
            UCBEst = self.qEst + \
                     self.UCBParam * np.sqrt(np.log(self.time + 1) / (np.asarray(self.actionCount) + 1))
            return np.argmax(UCBEst)
        if self.gradient:
            expEst = np.exp(self.qEst)
            self.actionProb = expEst / np.sum(expEst)
            return np.random.choice(self.indices, p=self.actionProb)
        return np.argmax(self.qEst)

    # take an action, update estimation for this action
    def takeAction(self, action):
        # generate the reward under N(real reward, 1)
        reward = np.random.randn() + self.qTrue[action]
        self.time += 1
        self.averageReward = (self.time - 1.0) / self.time * self.averageReward + reward / self.time
        self.actionCount[action] += 1

        if self.sampleAverages:
            # update estimation using sample averages
            self.qEst[action] += 1.0 / self.actionCount[action] * (reward - self.qEst[action])
        elif self.gradient:
            oneHot = np.zeros(self.k)
            oneHot[action] = 1
            if self.gradientBaseline:
                baseline = self.averageReward
            else:
                baseline = 0
            self.qEst = self.qEst + self.stepSize * (reward - baseline) * (oneHot - self.actionProb)
        else:
            # update estimation with constant step size
            self.qEst[action] += self.stepSize * (reward - self.qEst[action])
        return reward

figureIndex = 0

# for figure 2.1
def figure2_1():
    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1
    sns.violinplot(data=np.random.randn(200,10) + np.random.randn(10))
    plt.xlabel("Action")
    plt.ylabel("Reward distribution")

def banditSimulation(nBandits, time, bandits):
    bestActionCounts = [np.zeros(time, dtype='float') for _ in range(0, len(bandits))]
    averageRewards = [np.zeros(time, dtype='float') for _ in range(0, len(bandits))]
    for banditInd, bandit in enumerate(bandits):
        for i in range(0, nBandits):
            for t in range(0, time):
                action = bandit[i].getAction()
                reward = bandit[i].takeAction(action)
                averageRewards[banditInd][t] += reward
                if action == bandit[i].bestAction:
                    bestActionCounts[banditInd][t] += 1
        bestActionCounts[banditInd] /= nBandits
        averageRewards[banditInd] /= nBandits
    return bestActionCounts, averageRewards


# for figure 2.2
def epsilonGreedy(nBandits, time):
    epsilons = [0, 0.1, 0.01]
    bandits = []
    for epsInd, eps in enumerate(epsilons):
        bandits.append([Bandit(epsilon=eps, sampleAverages=True) for _ in range(0, nBandits)])
    bestActionCounts, averageRewards = banditSimulation(nBandits, time, bandits)
    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1
    for eps, counts in zip(epsilons, bestActionCounts):
        plt.plot(counts, label='epsilon = '+str(eps))
    plt.xlabel('Steps')
    plt.ylabel('% optimal action')
    plt.legend()
    plt.figure(figureIndex)
    figureIndex += 1
    for eps, rewards in zip(epsilons, averageRewards):
        plt.plot(rewards, label='epsilon = '+str(eps))
    plt.xlabel('Steps')
    plt.ylabel('average reward')
    plt.legend()


# for figure 2.3
def optimisticInitialValues(nBandits, time):
    bandits = [[], []]
    bandits[0] = [Bandit(epsilon=0, initial=5, stepSize=0.1) for _ in range(0, nBandits)]
    bandits[1] = [Bandit(epsilon=0.1, initial=0, stepSize=0.1) for _ in range(0, nBandits)]
    bestActionCounts, _ = banditSimulation(nBandits, time, bandits)
    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1
    plt.plot(bestActionCounts[0], label='epsilon = 0, q = 5')
    plt.plot(bestActionCounts[1], label='epsilon = 0.1, q = 0')
    plt.xlabel('Steps')
    plt.ylabel('% optimal action')
    plt.legend()


# for figure 2.4
def ucb(nBandits, time):
    bandits = [[], []]
    bandits[0] = [Bandit(epsilon=0, stepSize=0.1, UCBParam=2) for _ in range(0, nBandits)]
    bandits[1] = [Bandit(epsilon=0.1, stepSize=0.1) for _ in range(0, nBandits)]
    _, averageRewards = banditSimulation(nBandits, time, bandits)
    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1
    plt.plot(averageRewards[0], label='UCB c = 2')
    plt.plot(averageRewards[1], label='epsilon greedy epsilon = 0.1')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()


# for figure 2.5
def gradientBandit(nBandits, time):
    bandits =[[], [], [], []]
    bandits[0] = [Bandit(gradient=True, stepSize=0.1, gradientBaseline=True, trueReward=4) for _ in range(0, nBandits)]
    bandits[1] = [Bandit(gradient=True, stepSize=0.1, gradientBaseline=False, trueReward=4) for _ in range(0, nBandits)]
    bandits[2] = [Bandit(gradient=True, stepSize=0.4, gradientBaseline=True, trueReward=4) for _ in range(0, nBandits)]
    bandits[3] = [Bandit(gradient=True, stepSize=0.4, gradientBaseline=False, trueReward=4) for _ in range(0, nBandits)]
    bestActionCounts, _ = banditSimulation(nBandits, time, bandits)
    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline']
    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1
    for i in range(0, len(bandits)):
        plt.plot(bestActionCounts[i], label=labels[i])
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()

# Figure 2.6
def figure2_6(nBandits, time):
    labels = ['epsilon-greedy', 'gradient bandit',
              'UCB', 'optimistic initialization']
    generators = [lambda epsilon: Bandit(epsilon=epsilon, sampleAverages=True),
                  lambda alpha: Bandit(gradient=True, stepSize=alpha, gradientBaseline=True),
                  lambda coef: Bandit(epsilon=0, stepSize=0.1, UCBParam=coef),
                  lambda initial: Bandit(epsilon=0, initial=initial, stepSize=0.1)]
    parameters = [np.arange(-7, -1, dtype=np.float),
                  np.arange(-5, 2, dtype=np.float),
                  np.arange(-4, 3, dtype=np.float),
                  np.arange(-2, 3, dtype=np.float)]

    bandits = [[generator(pow(2, param)) for _ in range(0, nBandits)] for generator, parameter in zip(generators, parameters) for param in parameter]
    _, averageRewards = banditSimulation(nBandits, time, bandits)
    rewards = np.sum(averageRewards, axis=1)/time

    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1
    i = 0
    for label, parameter in zip(labels, parameters):
        l = len(parameter)
        plt.plot(parameter, rewards[i:i+l], label=label)
        i += l
    plt.xlabel('Parameter(2^x)')
    plt.ylabel('Average reward')
    plt.legend()


figure2_1()
epsilonGreedy(2000, 1000)
optimisticInitialValues(2000, 1000)
ucb(2000, 1000)
gradientBandit(2000, 1000)

# This will take somehow a long time
figure2_6(2000, 1000)

plt.show()

In my view, reinforcement learning is about an agent completing a given task in an unknown environment, adjusting its action policy according to the reward signals the environment feeds back. Reinforcement learning is therefore a process in which the environment and the agent interact, and the agent adapts its policy based on that interaction. In simulation code for reinforcement learning, the environment object and the agent object should be kept separate, with a procedural function written to bridge the two. The code above mixes everything together, which makes it hard to follow at first glance.

Below, following the ideas in the code above, I rewrite it in an object-oriented style:
① Create a class for the environment, Bandit
② Create a class for the agent, Agent
③ The interaction (see the simulation function) is: the agent chooses and executes an action (chooseAction), the environment produces a reward signal for that action (yieldReward), and the agent adjusts its actions according to the feedback (updatePolicy). This loop repeats, so the agent's policy keeps being improved by the reward signal, as previewed in the sketch below.
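
A minimal sketch of that loop (agent, env and timeSteps here stand in for the agent object, the environment object and the number of steps defined in the full program; the real simulation function later in this post adds the bookkeeping and the reset step):

# Minimal sketch of one training run: the agent and the environment
# only talk to each other through actions and rewards.
for t in range(timeSteps):
    action = agent.chooseAction()     # the agent picks an action under its current policy
    reward = env.yieldReward(action)  # the environment answers with a (noisy) reward
    agent.updatePolicy(reward)        # the agent improves its policy from this feedback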

The agent should have the following member variables and functions:
① Variables related to the environment, such as the kinds of actions it can take (e.g. the number of arms of the bandit)
② A function for choosing an action, a function for updating the policy, functions for saving and loading the policy, and finally a reset function that returns the agent to its original state

The environment should have the following member variables and functions:
① The parameters the problem needs, and the value of each action
② The best action (which, in a simulated environment, can be used to judge how well the designed algorithm performs), the ability to change within a certain range, and the generation of reward signals (feedback on the agent's actions)

The refactored code is shown below:

# !/usr/bin/env python3
# -*- coding: utf-8 -*-

#######################################################################
# Copyright (C)                                                       #
# 2018 Dianye Huang ([email protected])
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

# This program was rewritten with reference to the reinforcement-learning course code on GitHub, with a clearer structure.
# The environment and the agent are written as two interacting classes, keeping the environment's information separate
# from the agent's, so that the agent is a self-contained entity whose learned parameters can be reset and retrained
# in order to check the stability of the algorithm.

# Stationary problem: the action values are fixed and do not change, but the values fed back to the agent are noisy,
# so several trials are needed to estimate them.
# A bandit problem is a single-step decision problem that does not depend on the current state; there is no previous
# state to consider. An action acts on the environment and a reward is obtained directly.
# Non-associative task: successive actions are largely unrelated to one another.
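# For reference, the sample-average estimate used below can be maintained incrementally,
#     Q_{n+1} = Q_n + (1/n) * (R_n - Q_n),
# and replacing 1/n by a constant step size alpha gives the exponential recency-weighted
# average used by the 'Incremental' and 'OptimisticInitial' methods:
#     Q_{n+1} = Q_n + alpha * (R_n - Q_n)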

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Environment part: the environment receives an action and feeds back the corresponding reward
class Bandit:
    def __init__(self, kArm=10, trueReward=0):  # set up the initial variables
        self.kArm = kArm
        self.trueReward = trueReward  # mean offset of the true reward for each action
        self.actionCount = 0  # number of actions taken
        self.time = 0  # elapsed time steps
        # from the single offset given above, generate a true value for each arm (used to evaluate the algorithm)
        self.qTrue = []
        for i in range(0, self.kArm):
            self.qTrue.append(np.random.randn()+self.trueReward)

    def change(self):
        # changing the environment means re-drawing the action values
        self.qTrue = []
        for i in range(0, self.kArm):
            self.qTrue.append(np.random.randn()+self.trueReward)

    def yieldReward(self, action):  # feed back a reward
        self.time += 1
        return np.random.randn() + self.qTrue[action]

    def getBestAction(self):  # the optimal action, from a god's-eye view
        return int(np.argmax(self.qTrue))

    def getkArms(self):  # expose the environment configuration for the agent, used as a reference when measuring its performance
        return self.kArm

    def getTime(self):
        return self.time

    def showqTrue(self, figureIndex):
        plt.figure(figureIndex)
        print('qTrue:', self.qTrue)
        sns.violinplot(data=self.qTrue + np.random.randn(200, 10))
        plt.xlabel("Action")
        plt.ylabel("Reward distribution")

# Agent part: the agent can ① choose an action, ② receive a reward from the environment, ③ adjust its action policy, ④ reset its learned parameters
# It maximizes its return with action-value estimation methods
class Agent:
    def __init__(self, actionNum=10, method='SampleAverages',paraList=None):
        self.time = 0
        self.actionNum = actionNum
        self.currenAction = None
        self.method = method
        if self.method == 'SampleAverages':
            # para[0]->epsilon
            self.epsilon = paraList[0]
            self.qEst = np.zeros(actionNum)  # estimated value of each action (Est -> estimate)
            self.actionCount = np.zeros(actionNum)
        elif self.method == 'Incremental':
            # para[0]->epsilon; para[1]->step size
            self.epsilon = paraList[0]
            self.stepSize = paraList[1]
            self.qEst = np.zeros(actionNum)  # estimated value of each action (Est -> estimate)
        elif self.method == 'OptimisticInitial':  # optimistic initial action-value estimates
            # para[0]->epsilon; para[1]->step size; para[2]-> optimistic values
            self.epsilon = paraList[0]
            self.stepSize = paraList[1]
            self.optimisticValue = paraList[2]
            self.qEst = np.zeros(actionNum) + self.optimisticValue  # estimated value of each action (Est -> estimate)
        elif self.method == 'UCB':  # Upper Confidence Bound method: action value + exploration term
            # para[0]->step size; para[1]->c
            self.stepSize = paraList[0]  # params: stepSize and c (c controls the degree of exploration)
            self.c = paraList[1]
            self.qEst = np.zeros(self.actionNum)
            self.actionCount = np.zeros(actionNum)
        elif self.method == 'Gradient':
            self.sum = 0
            self.Ht = np.zeros(self.actionNum)
            self.alpha = paraList[0]
            self.baseLine = paraList[1]
            self.averageReward = 0
            self.actionProb = np.zeros(self.actionNum)

    def chooseAction(self):
        self.time += 1  # one action is taken per time step
        if self.method == 'SampleAverages' or self.method == 'Incremental' or self.method == 'OptimisticInitial':
            # epsilon-greedy policies: choose an action based on the estimated action values
            # explore with probability epsilon
            if self.epsilon > 0:
                if np.random.binomial(1, self.epsilon) == 1:
                    self.currenAction = np.random.choice(self.actionNum)  # return a random action
                    return self.currenAction
            # exploit -- greedy policy, with probability 1-epsilon
            self.currenAction = int(np.argmax(self.qEst))  # greedy selection, Eq. (2.2)
        elif self.method == 'UCB':
            explrProb = self.c*np.sqrt(np.log(self.time)/(self.actionCount+1))  # +1 avoids division by zero
            self.currenAction = int(np.argmax(self.qEst+explrProb))
        elif self.method == 'Gradient':
            # update the action-selection probabilities
            expEst = np.exp(self.Ht)
            self.actionProb = expEst / np.sum(expEst)  # soft-max function, Eq. (2.9)
            self.currenAction = np.random.choice(self.actionNum, p=self.actionProb)
        return self.currenAction

    def updatePolicy(self, reward):
        # update the estimate of the chosen action's value (qEst update)
        if self.method == 'SampleAverages':
            self.actionCount[self.currenAction] += 1  # count the action just taken
            self.qEst[self.currenAction] += 1.0/self.actionCount[self.currenAction]*(reward - self.qEst[self.currenAction])  # incremental update, so rewards need not be accumulated; see the derivation on p.21, Eq. (2.1) of the book
        elif self.method == 'Incremental' or self.method == 'OptimisticInitial':
            self.qEst[self.currenAction] += self.stepSize*(reward-self.qEst[self.currenAction])  # exponential recency-weighted average
        elif self.method == 'UCB':  # optimistic initial values + incremental + ucb
            self.actionCount[self.currenAction] += 1
            self.qEst[self.currenAction] += self.stepSize * (reward - self.qEst[self.currenAction])
        elif self.method == 'Gradient':
            oneHot = np.zeros(self.actionNum)
            oneHot[self.currenAction] = 1
            if self.baseLine:
                self.averageReward += (reward - self.averageReward) / float(self.time)  # incremental average reward, used as the baseline
                self.Ht += self.alpha * (reward - self.averageReward) * (oneHot - self.actionProb)
            else:
                self.Ht += self.alpha * reward * (oneHot - self.actionProb)

    def reset(self):
        self.time = 0  # restart the clock when the policy is reset
        # reset the learned parameters
        if self.method == 'SampleAverages':
            self.qEst = np.zeros(self.actionNum)
            self.actionCount = np.zeros(self.actionNum)  # numpy array operations; Python's built-in list type would not support them
        elif self.method == 'Incremental':
            self.qEst = np.zeros(self.actionNum)
        elif self.method == 'OptimisticInitial':
            self.qEst = np.zeros(self.actionNum) + self.optimisticValue  # back to the optimistic initial estimates
        elif self.method == 'UCB':
            self.qEst = np.zeros(self.actionNum)
            self.actionCount = np.zeros(self.actionNum)
        elif self.method == 'Gradient':
            self.averageReward = 0
            self.Ht = np.zeros(self.actionNum)
            self.actionProb = np.zeros(self.actionNum)

    def savePolicy(self):
        pass

    def loadPolicy(self):
        pass

# Run the simulation and record the data: this function ties the agent and the environment together and logs their interaction
def simulation(env, player, nBandits, time):
    # variables that record the agent's performance
    bestCount = np.zeros(time)  # count of optimal actions at each step
    averageReward = np.zeros(time)  # accumulated reward at each step (averaged below)
    # run nBandits rounds, each lasting `time` steps
    for i in range(0, nBandits):
        env.change()  # re-draw the environment's action values (they change within a certain range)
        for t in range(time):
            # the agent interacts with the environment and adjusts its action policy
            action = player.chooseAction()      # the player chooses an action
            reward = env.yieldReward(action)    # the environment returns a reward for that action
            player.updatePolicy(reward)         # the player updates its action values from the reward it received
            # record metrics that reflect the effect of the policy updates
            averageReward[t] += reward
            if action == env.getBestAction():   # count the cases where the decision equals the environment's best action
                bestCount[t] += 1
        # reset the agent's policy so the stability of the algorithm can be evaluated
        player.reset()  # after training from 0 to `time`, reset the policy and train again to estimate the average performance
    # compute the average performance
    averageReward /= nBandits
    bestCount /= nBandits
    return averageReward, bestCount


# Plotting helper
# Two kinds of plots are drawn: ① the average reward, ② the probability of choosing the optimal action
class Plot:
    def __init__(self):
        pass

    def plotting(self, data, figureIndex, labelStr, xStr, yStr):
        plt.figure(figureIndex)  # select the figure by its index
        plt.plot(data, label=labelStr)  # plot the data with the given label
        plt.xlabel(xStr)  # x-axis label
        plt.ylabel(yStr)  # y-axis label
        plt.legend()  # enable the legend

####################################################################################################
if __name__ == '__main__':
    # instantiate the plotting object
    plot = Plot()

    # instantiate the interacting objects
    env = Bandit(10, 4)  # instantiate the environment
    env.showqTrue(figureIndex=1)  # show the distribution of the true rewards

    # --------------- Agent 1 simulation -------------
    # instantiate a specific agent
    player = Agent(env.kArm, method='Gradient', paraList=[0.1, False])
    # run the simulation and return the data to be recorded
    avgReward, bestCount = simulation(env, player, 2000, 1000)
    # plot the results
    plot.plotting(avgReward, 2, 'Gradient (alpha=0.1, without baseline)', 'Steps', 'Average Reward')
    plot.plotting(bestCount, 3, 'Gradient (alpha=0.1, without baseline)', 'Steps', 'Optimal Action')

    # ---------------- Agent 2 simulation -------------
    player = Agent(env.kArm, method='Gradient', paraList=[0.1, True])
    avgReward, bestCount = simulation(env, player, 2000, 1000)
    plot.plotting(avgReward, 2, 'Gradient (alpha=0.1, with baseline)', 'Steps', 'Average Reward')
    plot.plotting(bestCount, 3, 'Gradient (alpha=0.1, with baseline)', 'Steps', 'Optimal Action')

    # ---------------- Agent 3 simulation -------------
    player = Agent(env.kArm, method='Gradient', paraList=[0.4, False])
    avgReward, bestCount = simulation(env, player, 2000, 1000)
    plot.plotting(avgReward, 2, 'Gradient (alpha=0.4, without baseline)', 'Steps', 'Average Reward')
    plot.plotting(bestCount, 3, 'Gradient (alpha=0.4, without baseline)', 'Steps', 'Optimal Action')

    # ---------------- Agent 4 simulation -------------
    player = Agent(env.kArm, method='Gradient', paraList=[0.4, True])
    avgReward, bestCount = simulation(env, player, 2000, 1000)
    plot.plotting(avgReward, 2, 'Gradient (alpha=0.4, with baseline)', 'Steps', 'Average Reward')
    plot.plotting(bestCount, 3, 'Gradient (alpha=0.4, with baseline)', 'Steps', 'Optimal Action')

    # show the plots
    plt.show()

The resulting plots are consistent with Figure 2.5 in the book.
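
The main program above only exercises the gradient-bandit agents, but the same classes can reproduce the other experiments of the chapter as well. For example, a minimal sketch along the following lines should give the epsilon-greedy comparison of Figure 2.2 (it assumes the Bandit, Agent, Plot and simulation definitions above are in scope, and uses figure indices 4 and 5 so as not to collide with the figures of the main program):

# Sketch: epsilon-greedy agents with sample-average updates, as in Figure 2.2.
env = Bandit(10, 0)  # true action values drawn around 0, as in the original figure
plot = Plot()
for eps in [0.0, 0.01, 0.1]:
    player = Agent(env.getkArms(), method='SampleAverages', paraList=[eps])
    avgReward, bestCount = simulation(env, player, 2000, 1000)
    plot.plotting(avgReward, 4, 'epsilon = ' + str(eps), 'Steps', 'Average Reward')
    plot.plotting(bestCount, 5, 'epsilon = ' + str(eps), 'Steps', 'Optimal Action')
plt.show()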
