強化學習經典算法筆記(九):LSTM加持的PolicyGradient算法
在上文《強化學習經典算法筆記(八):LSTM加持的A2C算法解決POMDP問題》的基礎上,實現了LSTM+MLP的Policy Gradient算法。
實現過程如下:
import argparse, math, os, sys
import numpy as np
import gym
from gym import wrappers
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn.utils as utils
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Interactive plotting so the reward curves refresh while training runs.
plt.ion()
# Command-line configuration for the REINFORCE training run.
parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
parser.add_argument('--env_name', type=str, default='LunarLanderContinuous-v2')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for reward (default: 0.99)')
parser.add_argument('--seed', type=int, default=123, metavar='N',
                    help='random seed (default: 123)')
parser.add_argument('--num_steps', type=int, default=1000, metavar='N',
                    help='max episode length (default: 1000)')
parser.add_argument('--num_episodes', type=int, default=2000, metavar='N',
                    help='number of episodes (default: 2000)')
# FIX: the help text previously said "number of episodes (default: 128)",
# copy-pasted from the argument above; this flag sets the hidden layer width.
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                    help='number of hidden units per layer (default: 128)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--ckpt_freq', type=int, default=100,
                    help='model saving frequency')
# NOTE(review): `type=bool` is an argparse pitfall -- ANY non-empty value
# (including "--display False") parses as True. Kept as-is for CLI
# compatibility; consider action='store_true' in a follow-up.
parser.add_argument('--display', type=bool, default=False,
                    help='display or not')
args = parser.parse_args()

env_name = args.env_name                     # gym environment id
env = gym.make(env_name)                     # create the environment
if args.display:
    env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)

# Seed gym, torch and numpy together for reproducibility.
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
class Policy(nn.Module):
    """LSTM-backed Gaussian policy for continuous action spaces.

    Maps an observation sequence to the mean and the raw (pre-softplus)
    variance of a diagonal Gaussian whose dimensionality equals the
    action-space dimensionality, plus the updated LSTM hidden state.
    """

    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space            # kept for callers that inspect it
        num_outputs = action_space.shape[0]         # action vector dimensionality
        self.lstm = nn.LSTM(num_inputs, hidden_size, batch_first=True)
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)   # mean head (mu)
        self.linear2_ = nn.Linear(hidden_size, num_outputs)  # variance head (sigma_sq)

    def forward(self, x, hidden):
        """Return (mu, sigma_sq, hidden) for input batch x and LSTM state hidden."""
        features, hidden = self.lstm(x, hidden)
        features = F.relu(self.linear1(features))
        mu = self.linear2(features)
        sigma_sq = self.linear2_(features)
        return mu, sigma_sq, hidden
class REINFORCE:
    """REINFORCE (vanilla policy gradient) agent with an LSTM Gaussian policy."""

    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)  # policy network
        # self.model = self.model.cuda()  # GPU version
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()
        self.pi = Variable(torch.FloatTensor([math.pi]))  # .cuda()  # constant pi as a tensor

    def normal(self, x, mu, sigma_sq):
        """Density of N(mu, sigma_sq) evaluated at action x (elementwise)."""
        a = (-1 * (Variable(x) - mu).pow(2) / (2 * sigma_sq)).exp()
        # expand_as broadcasts the scalar pi to sigma_sq's shape
        b = 1 / (2 * sigma_sq * self.pi.expand_as(sigma_sq)).sqrt()
        return a * b

    def select_action(self, state, hx, cx):
        """Sample an action from the policy; return (action, log_prob, entropy, hx, cx)."""
        # mu, sigma_sq = self.model(Variable(state).cuda())
        mu, sigma_sq, (hx, cx) = self.model(Variable(state), (hx, cx))
        sigma_sq = F.softplus(sigma_sq)  # ensure the variance is positive
        # Reparameterised sample: a ~ N(mu, sigma_sq) via a = mu + sigma*eps
        eps = torch.randn(mu.size())
        # action = (mu + sigma_sq.sqrt()*Variable(eps).cuda()).data
        action = (mu + sigma_sq.sqrt() * Variable(eps)).data
        prob = self.normal(action, mu, sigma_sq)
        # Differential entropy of N(mu, sigma^2): 0.5*log(2*pi*e*sigma^2)
        #                                       = 0.5*(log(2*pi*sigma^2) + 1).
        # FIX: the original computed -0.5*((sigma_sq + 2*pi).log() + 1) --
        # wrong sign and addition instead of multiplication -- which made the
        # entropy bonus in update_parameters penalise exploration.
        entropy = 0.5 * ((2 * self.pi.expand_as(sigma_sq) * sigma_sq).log() + 1)
        log_prob = prob.log()
        return action, log_prob, entropy, hx, cx

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        """One REINFORCE update over a full episode trajectory."""
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]  # discounted return, computed backwards
            # Policy-gradient loss minus a small entropy bonus (0.0001) to
            # encourage exploration.
            # loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).cuda()).sum() - (0.0001*entropies[i].cuda()).sum()
            loss = loss - (log_probs[i] * (Variable(R).expand_as(log_probs[i]))).sum() - (0.0001 * entropies[i]).sum()
        loss = loss / len(rewards)
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping: cap the global L2 norm of the gradients at 40.
        utils.clip_grad_norm_(self.model.parameters(), 40)
        self.optimizer.step()
# Build the agent from the environment's observation/action spaces.
agent = REINFORCE(args.hidden_size, env.observation_space.shape[0], env.action_space)

# Directory for periodic model checkpoints.
# NOTE: `dir` shadows the builtin dir() for the rest of the script.
dir = 'ckpt_' + env_name
if not os.path.exists(dir):
    os.mkdir(dir)

log_reward = []  # raw per-episode return
log_smooth = []  # exponentially smoothed return (0.99 decay)

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])
    entropies = []
    log_probs = []
    rewards = []
    # Zero-initialise the LSTM hidden/cell state:
    # shape (num_layers=1, batch=1, hidden_size).
    hx = torch.zeros(args.hidden_size).unsqueeze(0).unsqueeze(0);
    cx = torch.zeros(args.hidden_size).unsqueeze(0).unsqueeze(0);
    # print(hx.shape)
    for t in range(args.num_steps):  # one episode lasts at most num_steps frames
        # print(state.shape)
        # unsqueeze(0) adds the sequence dimension expected by the LSTM.
        action, log_prob, entropy, hx, cx = agent.select_action(state.unsqueeze(0),hx,cx)
        action = action.cpu()
        next_state, reward, done, _ = env.step(action.numpy()[0,0])
        entropies.append(entropy)
        log_probs.append(log_prob)
        rewards.append(reward)
        state = torch.Tensor([next_state])
        if done:
            break
    # Episode finished: one REINFORCE update over the whole trajectory.
    agent.update_parameters(rewards, log_probs, entropies, args.gamma)
    # Save a checkpoint every ckpt_freq episodes.
    if i_episode%args.ckpt_freq == 0:
        torch.save(agent.model.state_dict(), os.path.join(dir, 'reinforce-'+str(i_episode)+'.pkl'))
    print("Episode: {}, reward: {}".format(i_episode, np.sum(rewards)))
    log_reward.append(np.sum(rewards))
    if i_episode == 0:
        log_smooth.append(log_reward[-1])
    else:
        log_smooth.append(log_smooth[-1]*0.99+0.01*np.sum(rewards))
    # Live-update the learning curves.
    plt.plot(log_reward)
    plt.plot(log_smooth)
    plt.pause(1e-5)
env.close()
在LunarLanderContinuous-v2環境中,使用本文算法和vanilla PG算法進行訓練,結果如下,優勢明顯。
PG+LSTM
vanilla PG