Reinforcement learning, direct policy search, policy gradient, introductory example

Policy Gradient: An Introductory Example

Original link:
https://www.cnblogs.com/Twobox/
References:

https://datawhalechina.github.io/easy-rl/#/chapter4/chapter4

https://zhuanlan.zhihu.com/p/358700228

Policy network structure
[Figure: policy network structure]
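A sketch of the structure, matching the Model class defined below: the 4-dimensional CartPole observation passes through one hidden layer of 128 ReLU units, and a softmax over the 2 actions gives the policy

\[
\pi_\theta(a \mid s) = \operatorname{softmax}\!\big(W_2\,\operatorname{ReLU}(W_1 s + b_1) + b_2\big),
\qquad W_1 \in \mathbb{R}^{128\times 4},\ W_2 \in \mathbb{R}^{2\times 128}.
\]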
Algorithm flow and the policy gradient
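In outline, matching the code below: play N episodes with the current policy, record the probability of every action taken together with a weight A_t for it, and update the parameters by gradient ascent on the surrogate objective

\[
J(\theta) \approx \frac{1}{N}\sum_{n=1}^{N}\sum_{t=1}^{T_n} A_t^{(n)}\,\log \pi_\theta\!\left(a_t^{(n)} \mid s_t^{(n)}\right),
\]

where T_n is the length of episode n. loss_fun in model.py implements this by minimizing the negative of J(θ).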

Adding a baseline
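Because CartPole's rewards are all positive, weighting by the raw return could only push the probability of every sampled action up. In the code below, the baseline b is the average total reward over the N sampled episodes,

\[
b = \frac{1}{N}\sum_{n=1}^{N} R^{(n)}, \qquad R^{(n)} = \sum_{t} r_t^{(n)},
\]

and subtracting it from every weight lets below-average episodes decrease the probability of their actions.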

Assigning a more suitable score to each action
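Rather than giving every action in an episode the same total return, each action is credited only with the discounted rewards that come after it (its reward-to-go), minus the baseline:

\[
A_t^{(n)} = \sum_{t'=t}^{T_n} \gamma^{\,t'-t}\, r_{t'}^{(n)} \;-\; b.
\]

This is what the backward loop over r_list in agent.py computes (ad_temp = ad_temp * gamma + r_list[i]).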

Code structure

Required packages

import numpy as np
import gym
import matplotlib.pyplot as plt
import torch  # torch.optim.SGD and other built-in optimizers
import torch.nn as nn  # model building blocks
import torch.nn.functional as F  # built-in loss functions
from torch.utils.data import TensorDataset  # dataset wrapper
from torch.utils.data import DataLoader  # iterator
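Note that this example assumes the classic Gym API, in which env.reset() returns only the observation and env.step() returns a 4-tuple (state, reward, done, info); newer gym (>= 0.26) and gymnasium releases return additional values and would need small adjustments. numpy, TensorDataset, and DataLoader are imported here but not used in the rest of the code.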

model.py

def loss_fun(p, advantage, N):
    # p is p(a|s) for each sampled action; advantage is the corresponding weight
    # p is a Tensor; advantage is a list of numbers (or an equivalent Tensor)
    advantage = torch.Tensor(advantage)
    # objective: 1/N * sum_n sum_t (A_t * log p_t); minimize its negative
    loss = -torch.sum(torch.log(p) * advantage) / N
    return loss


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(4, 128)  # CartPole observation has 4 dimensions
        self.linear2 = nn.Linear(128, 2)  # two discrete actions

    def forward(self, x):
        out = self.linear1(x)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.softmax(out, dim=-1)  # action probabilities
        return out

    def fit(self, p, advantage, N):
        # note: a fresh Adam optimizer is created on every call, so its moment
        # estimates are reset between updates (kept as in the original code)
        opt = torch.optim.Adam(self.parameters(), 0.005)
        loss = loss_fun(p, advantage, N)
        opt.zero_grad()
        loss.backward()
        opt.step()
        opt.zero_grad()

agent.py

class Agent:
    def __init__(self, gamma):
        self.model = Model()

        # buffers for one batch of experience, used in the objective
        # 1/N * sum_n sum_t (A_t * log p_t)
        self.p = []          # p(a|s) of each sampled action
        self.advantage = []  # the corresponding weights A_t
        self.N = 0           # number of episodes collected
        self.gamma = gamma

    def get_action_p(self, state):
        # convert to a Tensor; at this point it is 1-D
        state = torch.FloatTensor(state)
        # add a batch dimension, i.e. wrap it in an outer []
        state = torch.unsqueeze(state, 0)

        p = self.model(state)
        return p  # tensor of shape [1, 2]

    def clear(self):
        self.advantage.clear()
        self.p.clear()
        self.N = 0

    def pay_n_times(self, N, env):
        # play N episodes and append the collected experience
        self.N += N
        r_sum = 0  # sum of total rewards over all N episodes
        advantage = []

        for n in range(N):
            state = env.reset()
            r_list = []  # reward of each step in this episode
            done = False
            while not done:
                p = self.get_action_p(state)
                # sample an action index according to the probabilities;
                # sampling along dim 1, so the result has shape [[0 or 1]]
                action = torch.multinomial(p, 1).item()  # .item() gives a plain int
                s_, r, done, _ = env.step(action)
                state = s_
                r_list.append(r)
                # keep p[0][action] as a 1-element tensor so torch.cat works later
                self.p.append(p[0][action].unsqueeze(0))

            r_sum += sum(r_list)
            # discounted reward-to-go: sum_i gamma^i * r_{t+i}, computed backwards
            ad_list = []
            ad_temp = 0
            for i in reversed(range(len(r_list))):
                ad_temp = ad_temp * self.gamma + r_list[i]
                ad_list.append(ad_temp)

            ad_list.reverse()
            advantage += ad_list

        # baseline: average total reward per episode
        b = r_sum / N
        advantage = [a - b for a in advantage]
        self.advantage += advantage

        # return the average score
        return b

    def learn(self):
        p = torch.cat(self.p)
        advantage = torch.FloatTensor(self.advantage)
        self.model.fit(p, advantage, self.N)

main.py

env = gym.make("CartPole-v1")
agent = Agent(0.95)

T = 1000  # number of gradient updates
N = 50  # number of episodes sampled per update
x, y = [], []
for t in range(T):
    avg_r = agent.pay_n_times(N, env)
    x.append(t)
    y.append(avg_r)
    print("{} : {}".format(t, avg_r))
    agent.learn()
    agent.clear()

    plt.plot(x,y)
    plt.pause(0.1)

plt.plot(x,y)
plt.show()

Results

Original author of this article: 魏雄
Original link:
https://www.cnblogs.com/Twobox/
