Reinforcement Learning Classic Algorithm Notes (9): LSTM-Augmented Policy Gradient

Building on the previous post, Reinforcement Learning Classic Algorithm Notes (8): Solving POMDPs with an LSTM-Augmented A2C Algorithm, this post implements an LSTM+MLP Policy Gradient (REINFORCE) algorithm.
The implementation is as follows:
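For reference, the update performed in update_parameters below minimizes the Monte-Carlo policy-gradient (REINFORCE) loss averaged over the episode, with a small entropy bonus; because an LSTM sits in front of the MLP head, the policy conditions on the observation history o_{0:t} rather than only the current observation:

\mathcal{L}(\theta) = -\frac{1}{T}\sum_{t=0}^{T-1}\Big[\log \pi_\theta(a_t \mid o_{0:t})\, G_t + \beta\, H\big(\pi_\theta(\cdot \mid o_{0:t})\big)\Big], \qquad G_t = \sum_{k=t}^{T-1}\gamma^{\,k-t} r_k, \quad \beta = 10^{-4}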

import argparse, math, os, sys
import numpy as np
import gym
from gym import wrappers
import matplotlib.pyplot as plt

import torch
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn.utils as utils

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

plt.ion()
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env_name', type=str, default='LunarLanderContinuous-v2')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--seed', type=int, default=123, metavar='N',             # random seed
                    help='random seed (default: 123)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',       # maximum number of steps per episode
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=2000, metavar='N',    # number of training episodes
                    help='number of episodes (default: 2000)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',      # number of hidden units in the network
                    help='number of hidden units (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100, 
                    help='model saving frequency')
parser.add_argument('--display', action='store_true',                         # flag instead of type=bool: argparse treats any string (even 'False') as True
                    help='wrap the environment with a Monitor for recording')
args = parser.parse_args()
env_name = args.env_name                                            # environment name
env = gym.make(env_name)                                            # create the environment

if args.display:
    env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)

env.seed(args.seed)                                                 # seed Gym, NumPy and PyTorch
torch.manual_seed(args.seed)                                        # so that runs are reproducible
np.random.seed(args.seed)

class Policy(nn.Module):                                            # the policy, parameterized by a neural network
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space                            # action space
        num_outputs = action_space.shape[0]                         # dimensionality of the action space

        self.lstm = nn.LSTM(num_inputs, hidden_size, batch_first=True)  # expects input of shape (batch, seq, num_inputs)
        self.linear1 = nn.Linear(hidden_size, hidden_size)           # hidden layer
        self.linear2 = nn.Linear(hidden_size, num_outputs)
        self.linear2_ = nn.Linear(hidden_size, num_outputs)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)                            # hidden = (h, c), each of shape (1, batch, hidden_size)
        x = F.relu(self.linear1(x))
        mu = self.linear2(x)                                        # to output actions in a continuous domain, the policy net
        sigma_sq = self.linear2_(x)                                 # defines a Gaussian whose dimension = the action-space dimension

        return mu, sigma_sq, hidden

class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)    # build the policy network
        # self.model = self.model.cuda()                              # GPU version
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3) # optimizer
        self.model.train()
        self.pi = Variable(torch.FloatTensor([math.pi])) # .cuda()    # the constant π

    def normal(self, x, mu, sigma_sq):                                # density of action x under the Gaussian defined by the policy net
        a = ( -1 * (Variable(x)-mu).pow(2) / (2*sigma_sq) ).exp()
        b = 1 / ( 2 * sigma_sq * self.pi.expand_as(sigma_sq) ).sqrt()      # expand_as broadcasts the scalar π to the shape of sigma_sq
        return a*b

    def select_action(self, state, hx, cx):

        # mu, sigma_sq = self.model(Variable(state).cuda())
        mu, sigma_sq, (hx,cx) = self.model(Variable(state),(hx,cx))
        sigma_sq = F.softplus(sigma_sq)                               # softplus keeps the variance positive

        eps = torch.randn(mu.size())                                  # standard-normal noise with the same shape as the action vector
        # action = (mu + sigma_sq.sqrt()*Variable(eps).cuda()).data
        action = (mu + sigma_sq.sqrt()*Variable(eps)).data            # equivalent to sampling an action from N(μ, σ²)
        prob = self.normal(action, mu, sigma_sq)                      # density of the sampled action
        entropy = 0.5*( ( 2 * self.pi.expand_as(sigma_sq) * sigma_sq ).log() + 1 )  # differential entropy of the Gaussian, 0.5*ln(2πeσ²); see https://blog.csdn.net/raby_gyl/article/details/73477043

        log_prob = prob.log()                                         # log-probability of the action
        return action, log_prob, entropy, hx, cx

    def update_parameters(self, rewards, log_probs, entropies, gamma):# update the network parameters
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]                                # discounted return, accumulated backwards in time
            # loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - (0.0001*entropies[i].cuda()).sum()
            loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i]))).sum() - (0.0001*entropies[i]).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm_(self.model.parameters(), 40)             # gradient clipping: cap the gradient L2 norm at 40
        self.optimizer.step()

agent = REINFORCE(args.hidden_size, env.observation_space.shape[0], env.action_space)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):    
    os.mkdir(dir)

log_reward = []
log_smooth = []
for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])
    entropies = []
    log_probs = []
    rewards = []
    hx = torch.zeros(args.hidden_size).unsqueeze(0).unsqueeze(0)    # initialize the LSTM hidden state, shape (1, 1, hidden_size)
    cx = torch.zeros(args.hidden_size).unsqueeze(0).unsqueeze(0)    # initialize the LSTM cell state
    # print(hx.shape)
    for t in range(args.num_steps): # an episode lasts at most num_steps steps
        # print(state.shape)
        action, log_prob, entropy, hx, cx = agent.select_action(state.unsqueeze(0),hx,cx)
        action = action.cpu()

        next_state, reward, done, _ = env.step(action.numpy()[0,0])

        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = torch.Tensor([next_state])

        if done:
            break
    # the episode is finished; update the policy
    agent.update_parameters(rewards, log_probs, entropies, args.gamma)

    if i_episode%args.ckpt_freq == 0:
        torch.save(agent.model.state_dict(), os.path.join(dir, 'reinforce-'+str(i_episode)+'.pkl'))

    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))
    log_reward.append(np.sum(rewards))
    if i_episode == 0:
        log_smooth.append(log_reward[-1])
    else:
        log_smooth.append(log_smooth[-1]*0.99+0.01*np.sum(rewards))
    
    
    plt.plot(log_reward)
    plt.plot(log_smooth)
    plt.pause(1e-5)
env.close()
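
To sanity-check a saved model, a minimal evaluation sketch along the following lines can be used. This is not part of the original post: the checkpoint filename is only an example, and the policy is run deterministically by taking the mean action mu instead of sampling.

# Minimal evaluation sketch (assumption: a checkpoint such as 'reinforce-1900.pkl' exists in ckpt_<env_name>)
eval_env = gym.make(env_name)
eval_policy = Policy(args.hidden_size, eval_env.observation_space.shape[0], eval_env.action_space)
eval_policy.load_state_dict(torch.load(os.path.join(dir, 'reinforce-1900.pkl')))
eval_policy.eval()

state = torch.Tensor([eval_env.reset()])
hx = torch.zeros(1, 1, args.hidden_size)                 # fresh LSTM state for the evaluation episode
cx = torch.zeros(1, 1, args.hidden_size)
total_reward = 0.0
for t in range(args.num_steps):
    with torch.no_grad():
        mu, sigma_sq, (hx, cx) = eval_policy(state.unsqueeze(0), (hx, cx))
    next_state, reward, done, _ = eval_env.step(mu.numpy()[0, 0])   # act with the mean, i.e. no exploration noise
    total_reward += reward
    state = torch.Tensor([next_state])
    if done:
        break
print('evaluation return:', total_reward)
eval_env.close()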

In the LunarLanderContinuous-v2 environment, training with this algorithm and with a vanilla Policy Gradient gives the results below; the LSTM-augmented version has a clear advantage.
[Figure: training reward curve, PG+LSTM]
[Figure: training reward curve, vanilla PG]
