一、PPO主體
1、主結構
PPO主體主要分爲兩個部分,初始化部分init用來設定網絡的一些超參數,以及構建網絡,第二部分train則用於更新網絡參數(實際代碼中,該PPO主體繼承自另外一個主要用於設定超參數的類)。
class PPO():
    """Outline of the PPO agent: a constructor plus a parameter-update method."""

    def __init__(...):
        # Placeholder signature: sets hyper-parameters and builds the networks.
        pass

    def train(self, states, actions, advantages, logp_olds, returns):
        # Updates the network parameters from a batch of collected samples.
        pass
2、初始化部分
根據動作類型選取合適的網絡模型,關於不同網絡模型的代碼實現參考上一篇文章
def __init__(
        self,
        state_shape,
        action_dim,
        is_discrete,
        max_action=1.,
        actor_units=None,
        critic_units=None,
        lr_actor=1e-3,
        lr_critic=3e-3,
        const_std=0.3,
        hidden_activation_actor="relu",
        hidden_activation_critic="relu",
        clip_ratio=0.2,
        name="PPO",
        **kwargs):
    """Build the actor and critic networks together with their optimizers.

    Args:
        state_shape: Shape of the observation; forwarded to both networks.
        action_dim: Action dimension forwarded to the actor.
        is_discrete: If true a ``CategoricalActor`` is built, otherwise a
            ``GaussianActor``.
        max_action: Forwarded to ``GaussianActor`` only.
        actor_units: Hidden layer sizes of the actor. ``None`` selects
            ``[256, 256]``.
        critic_units: Hidden layer sizes of the critic. ``None`` selects
            ``[256, 256]``.
        lr_actor: Learning rate of the actor's Adam optimizer.
        lr_critic: Learning rate of the critic's Adam optimizer.
        const_std: Forwarded to ``GaussianActor`` only.
        hidden_activation_actor: Forwarded to ``GaussianActor`` only; the
            categorical actor is built without it.
        hidden_activation_critic: Hidden activation forwarded to ``CriticV``.
        clip_ratio: Stored as ``self.clip_ratio``.
        name: Forwarded to the parent initializer.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    super().__init__(name=name, **kwargs)

    # Defaults are resolved here rather than in the signature so that no
    # list object is shared between separate instances.
    if actor_units is None:
        actor_units = [256, 256]
    if critic_units is None:
        critic_units = [256, 256]

    self.clip_ratio = clip_ratio
    self._is_discrete = is_discrete

    # Build the network models.
    if is_discrete:
        self.actor = CategoricalActor(
            state_shape, action_dim, actor_units)
    else:
        self.actor = GaussianActor(
            state_shape, action_dim, max_action, actor_units,
            hidden_activation=hidden_activation_actor,
            const_std=const_std)
    self.critic = CriticV(state_shape, critic_units,
                          hidden_activation=hidden_activation_critic)

    # Build the optimizers; actor and critic are optimized separately.
    self.actor_optimizer = tf.keras.optimizers.Adam(
        learning_rate=lr_actor)
    self.critic_optimizer = tf.keras.optimizers.Adam(
        learning_rate=lr_critic)

    # This is used to check if input state to `get_action` is multiple (batch) or single
    self._state_ndim = np.array(state_shape).shape[0]
3、訓練部分
a、訓練actor
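因爲這裏actor跟critic分成兩個獨立的網絡,不共享網絡參數,所以value_loss被獨立出來單獨進行梯度計算,actor的損失函數只包含裁剪項與熵項,用以下公式表示(其中熵係數 $c_2$ 取0.01):
$$L^{actor}(\theta) = -\hat{\mathbb{E}}_t\left[\min\left(r_t(\theta)\hat{A}_t,\ \mathrm{clip}\left(r_t(\theta),\,1-\epsilon,\,1+\epsilon\right)\hat{A}_t\right)\right] - c_2\,\hat{\mathbb{E}}_t\left[S[\pi_\theta](s_t)\right],\qquad r_t(\theta)=\frac{\pi_\theta(a_t\mid s_t)}{\pi_{\theta_{old}}(a_t\mid s_t)}$$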
代碼如下:
@tf.function
def _train_actor_body(self, states, actions, advantages, logp_olds):
    """Apply one gradient step to the actor using the clipped surrogate loss.

    The loss is the negated mean of ``min(ratio * adv, clip(ratio) * adv)``
    minus ``self.entropy_coef`` times the mean policy entropy.

    Args:
        states: Batch of states fed to the actor.
        actions: Actions whose log-probabilities are re-evaluated.
        advantages: Advantage estimates; squeezed before use.
        logp_olds: Log-probabilities recorded when the actions were
            sampled; squeezed before use.

    Returns:
        The tuple ``(actor_loss, logp_news, ratio, ent)``.

    Raises:
        NotImplementedError: If ``self.clip`` is falsy.
    """
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            # Mean entropy of the current policy over the batch.
            ent = tf.reduce_mean(
                self.actor.compute_entropy(states))
            if not self.clip:
                raise NotImplementedError
            # Log-probabilities of the stored actions under the new policy.
            logp_news = self.actor.compute_log_probs(
                states, actions)
            # Probability ratio between the new and the old policy.
            ratio = tf.math.exp(logp_news - tf.squeeze(logp_olds))
            adv = tf.squeeze(advantages)
            clipped_ratio = tf.clip_by_value(
                ratio,
                1.0 - self.clip_ratio,
                1.0 + self.clip_ratio)
            # Pessimistic choice between the raw and the clipped objective.
            surrogate = tf.minimum(ratio * adv, clipped_ratio * adv)
            actor_loss = -tf.reduce_mean(surrogate)
            # Entropy bonus.
            actor_loss -= self.entropy_coef * ent
        actor_params = self.actor.trainable_variables
        actor_grad = tape.gradient(actor_loss, actor_params)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, actor_params))
    return actor_loss, logp_news, ratio, ent
熵值的計算:
def compute_entropy(self, state):
    """Return the entropy of the Gaussian policy for the given state(s).

    For every dimension the term ``log_std + log(sqrt(2 * pi * e))`` is
    formed, and the terms are summed over the last axis.

    Args:
        state: Input handed to ``self._compute_dist``.

    Returns:
        Tensor holding the entropy summed over the last axis.
    """
    # Use the parameter name itself; the body previously read an undefined
    # name, which raised NameError on every call.
    param = self._compute_dist(state)
    log_stds = param["log_std"]
    return tf.reduce_sum(
        log_stds + tf.math.log(tf.math.sqrt(2 * np.pi * np.e)), axis=-1)
b、訓練critic
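損失函數如下(這裏係數 $c_1$ 取0.5):
$$L^{VF}(\phi) = \frac{1}{T}\sum_{t=1}^{T} c_1\left(\hat{R}_t - V_\phi(s_t)\right)^2$$
其中T指的是該序列的長度,$\hat{R}_t$ 是回報(returns),$V_\phi(s_t)$ 是critic的輸出,代碼如下: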
@tf.function
def _train_critic_body(self, states, returns):
    """Apply one gradient step to the critic on a squared-error loss.

    Args:
        states: Batch of states fed to the critic.
        returns: Regression targets for the value estimates; squeezed
            before use.

    Returns:
        The critic loss, i.e. the mean of ``0.5 * (returns - V(states))**2``.
    """
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            predicted_values = self.critic(states)
            # NOTE(review): assumes the critic output has the same shape as
            # the squeezed returns, otherwise the subtraction broadcasts --
            # confirm against CriticV.
            value_errors = tf.squeeze(returns) - predicted_values
            critic_loss = tf.reduce_mean(0.5 * tf.square(value_errors))
        critic_params = self.critic.trainable_variables
        critic_grad = tape.gradient(critic_loss, critic_params)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, critic_params))
    return critic_loss
二、環境交互
1、 交互部分主結構
class OnPolicyTrainer(object):
    # Outline of the trainer that lets the policy interact with the environment.
    def __init__(self,...):
        '''
        Initialise the training parameters and take in the environment, the policy, etc.
        '''
        pass

    def __call__(self):
        '''
        Main loop: collect data and update the networks.
        '''
        pass

    def finish_horizon(self, last_val=0):
        '''
        Called whenever a sequence of length T has been collected.
        Computes the advantages and stores the results in the buffer.
        '''
        pass

    def evaluate_policy(self, total_steps):
        '''
        Evaluate the score achieved by the policy model.
        '''
        pass

    def _set_from_args(self, args):
        '''
        Set the parameters.
        '''
        pass

    @staticmethod
    def get_argument(parser=None):
        '''
        Get the parameters.
        '''
        pass
2、初始化部分
def __init__(self, policy, env, args, test_env=None):
    """Store the policy and the environments used for training and testing.

    Args:
        policy: Policy object kept as ``self._policy``.
        env: Training environment.
        args: Parsed arguments handed to ``self._set_from_args``.
        test_env: Environment used for evaluation; when ``None`` the
            training environment is used for evaluation as well.
    """
    self._set_from_args(args)
    self._policy = policy
    self._env = env
    if test_env is None:
        self._test_env = self._env
    else:
        self._test_env = test_env

    # Optional observation normalisation: both environments are wrapped.
    # NOTE(review): the source comment described the transform as
    # "obs-mean/(var+ 1e8)"; the exact formula lives in NormalizeObsEnv --
    # confirm there.
    if self._normalize_obs:
        self._env = NormalizeObsEnv(self._env)
        self._test_env = NormalizeObsEnv(self._test_env)
    ...
    # Part of the code used for monitoring data is omitted here.
    ...
3、調用
ppo2算法裏面,規定序列長度,即每一輪的最大步數,當步數達到最大值或者該輪結束時,通過以下式子進行網絡更新,其中
buffer的存儲調用利用cpprb庫提供的api實現。回調函數如下
def __call__(self):
    """Run the training loop until `self._max_steps` environment steps are done.

    Each pass of the outer loop collects `self._policy.horizon` steps into a
    local buffer, converts them through `finish_horizon`, and then trains the
    policy for `self._policy.n_epoch` epochs on shuffled mini-batches.
    """
    # Prepare the buffers used for each round of updates
    # Prepare buffer
    self.replay_buffer = get_replay_buffer(
        self._policy, self._env)
    kwargs_local_buf = get_default_rb_dict(
        size=self._policy.horizon, env=self._env)
    # Extra per-step fields: log-probability and value estimate.
    kwargs_local_buf["env_dict"]["logp"] = {}
    kwargs_local_buf["env_dict"]["val"] = {}
    if is_discrete(self._env.action_space):
        kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
    self.local_buffer = ReplayBuffer(**kwargs_local_buf)
    episode_steps = 0
    episode_return = 0
    episode_start_time = time.time()
    total_steps = np.array(0, dtype=np.int32)
    n_epoisode = 0
    obs = self._env.reset()
    tf.summary.experimental.set_step(total_steps)
    while total_steps < self._max_steps:
        # Collect samples
        for _ in range(self._policy.horizon):
            act, logp, val = self._policy.get_action_and_val(obs)
            next_obs, reward, done, _ = self._env.step(act)
            episode_steps += 1
            total_steps += 1
            episode_return += reward
            done_flag = done
            # When the step count equals the env's own `_max_episode_steps`,
            # the transition is stored with done=False.
            # NOTE(review): presumably because a time-limit cut-off is not a
            # true terminal state -- confirm.
            if hasattr(self._env, "_max_episode_steps") and \
                    episode_steps == self._env._max_episode_steps:
                done_flag = False
            self.local_buffer.add(
                obs=obs, act=act, next_obs=next_obs,
                rew=reward, done=done_flag, logp=logp, val=val)
            obs = next_obs
            if done or episode_steps == self._episode_max_steps:
                tf.summary.experimental.set_step(total_steps)
                # Episode ended: flush the local buffer with the default last_val of 0.
                self.finish_horizon()
                obs = self._env.reset()
                n_epoisode += 1
                fps = episode_steps / (time.time() - episode_start_time)
                self.logger.info(
                    "Total Epi: {0: 5} Steps: {1: 7} Episode Steps: {2: 5} Return: {3: 5.4f} FPS: {4:5.2f}".format(
                        n_epoisode, int(total_steps), episode_steps, episode_return, fps))
                episode_steps = 0
                episode_return = 0
                episode_start_time = time.time()
        # Horizon reached: flush what is left, passing the latest value estimate as last_val.
        self.finish_horizon(last_val=val)
        tf.summary.experimental.set_step(total_steps)
        # Update the parameters
        if self._policy.normalize_adv:
            # Advantage statistics over the first `horizon` stored entries.
            samples = self.replay_buffer._encode_sample(np.arange(self._policy.horizon))
            mean_adv = np.mean(samples["adv"])
            std_adv = np.std(samples["adv"])
        with tf.summary.record_if(total_steps % self._save_summary_interval == 0):
            for _ in range(self._policy.n_epoch):
                # Re-read the stored samples in a fresh random order every epoch.
                samples = self.replay_buffer._encode_sample(
                    np.random.permutation(self._policy.horizon))
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - mean_adv) / (std_adv + 1e-8)
                else:
                    adv = samples["adv"]
                # Consecutive slices of `batch_size` samples form the mini-batches.
                for idx in range(int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(
                        states=samples["obs"][target],
                        actions=samples["act"][target],
                        advantages=adv[target],
                        logp_olds=samples["logp"][target],
                        returns=samples["ret"][target])
4、計算adv
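在ppo2裏面,優勢值通過以下方式(GAE)計算:
$$\hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma\lambda)^{l}\,\delta_{t+l}$$
其中,$\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$,$\gamma$ 爲折扣因子(discount),$\lambda$ 爲GAE係數(lam)。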
def finish_horizon(self, last_val=0):
    """Turn the locally collected steps into training targets and store them.

    All entries of ``self.local_buffer`` are read, advantages and
    rewards-to-go are computed, the result is added to
    ``self.replay_buffer`` and the local buffer is cleared.

    Args:
        last_val: Value appended after the last stored reward and value
            estimate; it bootstraps both the TD residuals and the returns.
    """
    stored_indices = np.arange(self.local_buffer.get_stored_size())
    samples = self.local_buffer._encode_sample(stored_indices)
    rews = np.append(samples["rew"], last_val)
    vals = np.append(samples["val"], last_val)
    gamma = self._policy.discount

    # One-step TD residuals; with GAE enabled they are accumulated with
    # decay factor discount * lam, otherwise used directly.
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    advs = (discount_cumsum(deltas, gamma * self._policy.lam)
            if self._policy.enable_gae else deltas)

    # Rewards-to-go serve as targets for the value function; the appended
    # bootstrap entry is dropped again.
    rets = discount_cumsum(rews, gamma)[:-1]

    self.replay_buffer.add(
        obs=samples["obs"], act=samples["act"], done=samples["done"],
        ret=rets, adv=advs, logp=np.squeeze(samples["logp"]))
    self.local_buffer.clear()
其中,discount_cumsum函數用以下方式實現
def discount_cumsum(x, discount):
    """Compute discounted cumulative sums along the first axis.

    Forked from rllab.

    :param x (np.ndarray or tf.Tensor)
        vector of [x0, x1, x2]
    :param discount
        factor applied once per step into the future
    :return output:
        [x0 + discount * x1 + discount^2 * x2,
         x1 + discount * x2,
         x2]
    """
    # Filtering the time-reversed signal with y[n] = x[n] + discount * y[n-1]
    # accumulates the future terms; reversing again restores the order.
    reversed_input = x[::-1]
    accumulated = lfilter([1], [1, float(-discount)], reversed_input, axis=0)
    return accumulated[::-1]
5、檢驗函數
def evaluate_policy(self, total_steps):
    """Run evaluation episodes on the test environment and average the returns.

    Args:
        total_steps: Number of training steps done so far. It is not used by
            the evaluation logic below and is kept for interface
            compatibility.

    Returns:
        Sum of the episode returns divided by ``self._test_episodes``.
    """
    if self._normalize_obs:
        # Copy the normalizer parameters of the training env to the test env.
        self._test_env.normalizer.set_params(
            *self._env.normalizer.get_params())
    if self._save_test_path:
        replay_buffer = get_replay_buffer(
            self._policy, self._test_env, size=self._episode_max_steps)

    # Actions are executed in the test env, so its bounds are the ones used
    # for clipping.
    action_space = self._test_env.action_space
    can_clip = hasattr(action_space, "high")

    total_return = 0.
    for _episode in range(self._test_episodes):
        episode_return = 0.
        obs = self._test_env.reset()
        for _step in range(self._episode_max_steps):
            act, _ = self._policy.get_action(obs, test=True)
            if can_clip:
                act = np.clip(act, action_space.low, action_space.high)
            next_obs, reward, done, _ = self._test_env.step(act)
            if self._save_test_path:
                replay_buffer.add(
                    obs=obs, act=act, next_obs=next_obs,
                    rew=reward, done=done)
            episode_return += reward
            obs = next_obs
            if done:
                break
        # Accumulate every episode; without this the average is always 0.
        total_return += episode_return
    return total_return / self._test_episodes
三、 run_ppo
導入相關模塊。utils主要涵蓋了一些瑣碎的功能,例如跟環境相關的。
import gym
import tensorflow as tf

from on_policy_trainer import OnPolicyTrainer
from ppo import PPO
from utils import is_discrete, get_act_dim
主程序,先從trainer那裏獲取默認的超參數,然後設定跟訓練、測試相關的參數。PPO算法是一種隨機策略算法;在連續動作的情況下,根據openAI官方推薦,其策略輸出的對數標準差不是狀態的函數,而是與狀態無關的獨立參數。
官方說法:
There is a single vector of log standard deviations, $\log \sigma$, which is not a function of state: the $\log \sigma$ are standalone parameters. (You Should Know: our implementations of VPG, TRPO, and PPO do it this way.)
if __name__ == '__main__':
    # Collect the command-line options of the trainer and of the PPO agent,
    # then override some defaults for this experiment.
    parser = OnPolicyTrainer.get_argument()
    parser = PPO.get_argument(parser)
    parser.add_argument('--env-name', type=str,
                        default="Pendulum-v0")
    parser.set_defaults(test_interval=20480)
    parser.set_defaults(max_steps=int(1e7))
    parser.set_defaults(horizon=2048)
    parser.set_defaults(batch_size=64)
    parser.set_defaults(gpu=-1)
    parser.set_defaults(episode_max_steps=200)
    args = parser.parse_args()

    # Separate environment instances for training and for evaluation.
    env = gym.make(args.env_name)
    test_env = gym.make(args.env_name)

    policy = PPO(
        state_shape=env.observation_space.shape,
        action_dim=get_act_dim(env.action_space),
        is_discrete=is_discrete(env.action_space),
        # A maximum action only exists for non-discrete action spaces.
        max_action=None if is_discrete(
            env.action_space) else env.action_space.high[0],
        batch_size=args.batch_size,
        actor_units=[128, 64],
        critic_units=[128, 64],
        n_epoch=10,
        n_epoch_critic=10,
        lr_actor=3e-4,
        lr_critic=3e-4,
        discount=0.99,
        lam=0.95,
        hidden_activation=tf.nn.relu,
        horizon=args.horizon,
        normalize_adv=args.normalize_adv,
        enable_gae=args.enable_gae,
        gpu=args.gpu)

    # Hand the dedicated test environment to the trainer; when it is left
    # out the trainer falls back to the training environment for evaluation.
    trainer = OnPolicyTrainer(policy, env, args, test_env=test_env)
    trainer()
最後來看下結果,大約在400k步的時候就開始收斂了;如果想收斂得更快,可以自己嘗試調整參數。