搭建DQN
初始化
# Number of available actions.
self.n_actions
# Number of state features.
self.n_features
# Learning rate handed to the optimizer.
self.lr
# Reward discount factor used in the Q-learning target.
self.gamma
# Upper bound of the e-greedy selection probability.
self.epsilon_max
# Number of learning steps between refreshes of the target-network parameters.
self.replace_target_iter
# Number of transitions the replay memory can hold.
self.memory_size
# Number of samples drawn from the replay memory on each learning step.
self.batch_size = batch_size
# Amount added to epsilon after each learning step (may be None).
self.epsilon_increment = e_greedy_increment
# Start from 0 when an increment is supplied; otherwise use epsilon_max directly.
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
# Count of learning steps performed so far.
self.learn_step_counter
# Replay memory: one row per transition, n_features * 2 + 2 columns
# (current state features, action, reward, next state features).
self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
# Gather the variables of both networks so the target net can be refreshed
# from the eval net.
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
# Builds a list of TensorFlow ops [tf.assign(t1, e1), tf.assign(t2, e2), tf.assign(t3, e3)];
# each op writes an eval-net value into the matching target-net variable.
self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
構建神經網絡
構造Q估計神經網絡
def _build_net(self):
    """Build the evaluation (Q-estimate) network, its loss and its train op.

    Creates two placeholders (``self.s`` for states, ``self.q_target`` for
    the training targets), a two-layer fully connected network whose output
    is ``self.q_eval``, a mean-squared-difference loss and an RMSProp
    training op.
    """
    # State input: one row per sample, n_features columns.
    self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
    # Target Q-values the network is regressed towards.
    self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')

    with tf.variable_scope('eval_net'):
        # Collections every eval-net variable is registered in.
        c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
        # Hidden-layer width and the initializers. These three names are
        # reused further down when the target network is built, so they
        # must stay as they are.
        n_l1 = 10
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        # Hidden layer with ReLU activation.
        with tf.variable_scope('l1'):
            hidden_w = tf.get_variable(
                'w1', [self.n_features, n_l1],
                initializer=w_initializer, collections=c_names)
            hidden_b = tf.get_variable(
                'b1', [1, n_l1],
                initializer=b_initializer, collections=c_names)
            hidden_pre = tf.matmul(self.s, hidden_w) + hidden_b
            hidden_out = tf.nn.relu(hidden_pre)

        # Linear output layer: one Q-value per action.
        with tf.variable_scope('l2'):
            out_w = tf.get_variable(
                'w2', [n_l1, self.n_actions],
                initializer=w_initializer, collections=c_names)
            out_b = tf.get_variable(
                'b2', [1, self.n_actions],
                initializer=b_initializer, collections=c_names)
            self.q_eval = tf.matmul(hidden_out, out_w) + out_b

    # Loss: mean squared difference between target and estimated Q-values.
    with tf.variable_scope('loss'):
        squared_error = tf.squared_difference(self.q_target, self.q_eval)
        self.loss = tf.reduce_mean(squared_error)

    # Training op: RMSProp on the loss with the configured learning rate.
    with tf.variable_scope('train'):
        optimizer = tf.train.RMSPropOptimizer(self.lr)
        self._train_op = optimizer.minimize(self.loss)
構造Q現實神經網絡(該段代碼緊接着上段,屬於_build_net()函數)
# Continuation of _build_net(): the target network. It mirrors the eval
# network's layout but keeps its own variables, registered under the
# 'target_net_params' collection. Reuses n_l1, w_initializer and
# b_initializer defined earlier in the same method.
#
# Next-state input. The attribute must be `self.s_`: both the first layer
# below and the feed_dict in learn() read `self.s_`, so storing it under any
# other attribute name raises AttributeError. The graph name is left as-is.
self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_sub')
with tf.variable_scope('target_net'):
    # Collections every target-net variable is registered in.
    c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
    # Hidden layer with ReLU activation.
    with tf.variable_scope('l1'):
        w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
        b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
        l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)
    # Linear output layer: one Q-value per action for the next state.
    with tf.variable_scope('l2'):
        w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
        b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
        self.q_next = tf.matmul(l1, w2) + b2
存儲狀態信息
def store_transition(self, s, a, r, s_):
    """Write one transition into the replay memory, overwriting old rows.

    The stored row is ``[*s, a, r, *s_]``. Rows are written round-robin:
    once ``memory_size`` transitions have been stored, the next one goes
    back to row 0.
    """
    # The counter is created lazily on the first call.
    try:
        written = self.memory_counter
    except AttributeError:
        written = self.memory_counter = 0

    # Flatten state, [action, reward] and next state into a single row.
    row = np.hstack((s, [a, r], s_))
    # Wrap around so the buffer never grows past memory_size rows.
    slot = written % self.memory_size
    self.memory[slot, :] = row
    self.memory_counter = written + 1
選擇動作action
def choose_action(self, observation):
    """Pick an action for ``observation`` with an e-greedy rule.

    With probability ``self.epsilon`` the action with the largest estimated
    Q-value is returned; otherwise an action index is drawn uniformly from
    ``[0, n_actions)``.
    """
    # Add a leading batch axis: [x, y] -> [[x, y]].
    batched = observation[np.newaxis, :]

    exploit = np.random.uniform() < self.epsilon
    if not exploit:
        return np.random.randint(0, self.n_actions)

    # Estimated Q-value of every action for this single observation.
    q_values = self.sess.run(self.q_eval, feed_dict={self.s: batched})
    return np.argmax(q_values)
增強學習過程
def learn(self):
    """Run one training step of the eval network on a sampled mini-batch.

    Steps:
      1. Every ``replace_target_iter`` learning steps, run
         ``replace_target_op`` to refresh the target network.
      2. Sample ``batch_size`` rows (with replacement) from the filled part
         of the replay memory.
      3. Build the training target: a copy of the eval network's output in
         which only the entry of the action actually taken is replaced by
         ``reward + gamma * max(q_next)``, so the other entries contribute
         zero error.
      4. Run the train op, record the loss in ``cost_his``, then raise
         epsilon by ``epsilon_increment`` without exceeding ``epsilon_max``.
    """
    # Periodically refresh the target network.
    if self.learn_step_counter % self.replace_target_iter == 0:
        self.sess.run(self.replace_target_op)

    # Only rows that have actually been written are eligible for sampling.
    filled_rows = min(self.memory_counter, self.memory_size)
    sample_index = np.random.choice(filled_rows, size=self.batch_size)

    # Each row is laid out as [state..., action, reward, next_state...].
    batch_memory = self.memory[sample_index, :]
    states = batch_memory[:, :self.n_features]
    next_states = batch_memory[:, -self.n_features:]

    q_next, q_eval = self.sess.run(
        [self.q_next, self.q_eval],
        feed_dict={
            self.s_: next_states,  # fed to the target network
            self.s: states,        # fed to the eval network
        })

    # Start from the current estimates so untouched actions give zero error.
    q_target = q_eval.copy()
    batch_index = np.arange(self.batch_size, dtype=np.int32)
    eval_act_index = batch_memory[:, self.n_features].astype(int)
    reward = batch_memory[:, self.n_features + 1]
    # NOTE: stored transitions carry no terminal flag, so the bootstrap
    # term is applied to every sample.
    q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

    # Train the eval network towards the targets.
    _, self.cost = self.sess.run(
        [self._train_op, self.loss],
        feed_dict={self.s: states, self.q_target: q_target})
    self.cost_his.append(self.cost)

    # Raise epsilon but clamp it, so a final increment can never push it
    # past epsilon_max. A missing increment (None) leaves epsilon unchanged.
    if self.epsilon < self.epsilon_max:
        increment = 0 if self.epsilon_increment is None else self.epsilon_increment
        self.epsilon = min(self.epsilon + increment, self.epsilon_max)
    else:
        self.epsilon = self.epsilon_max
    self.learn_step_counter += 1
舉例說明上述過程
數據結構
- n_actions=3
- n_features=2
- batch_size=2
q-eval結構
action_0 | action_1 | action_2 |
---|---|---|
1 | 2 | 1 |
2 | 3 | 2 |
行:每一個樣本
列:每一個action對應的Q值
q-next,q-target與q-eval結構相同
batch-index樣本索引
一維list ==> [0, 1] #長度:batch_size
eval_act_index每個樣本對應的action的值,也就是每個樣本列的索引
一維list ==> [0, 2]
reward每個樣本對應的reward的值
一維list ==> [1, 2]
過程
- 將q-eval的值賦給q-target
- 利用Q-learning算法,計算每一個樣本的對應action的q值
- 樣本0,採取了action=0,真實的q值爲-1
- 樣本1,採取了action=2,真實的q值爲-2
- 更新q-target中的值
action_0 | action_1 | action_2 |
---|---|---|
-1 | 2 | 1 |
2 | 3 | -2 |
- 利用更新後的q-target與q-eval之間的差值進行訓練
仿真過程
def run_maze():
    """Run 300 maze episodes, storing every transition and training periodically.

    Uses the module-level ``env`` (environment) and ``RL`` (agent) objects.
    Learning is triggered only once the step counter exceeds 200, and then
    only on steps where the counter is a multiple of 5.
    """
    # Step counter shared across all episodes (it is never reset).
    # NOTE(review): the source lost its indentation; the increment is assumed
    # to sit inside the step loop, after the termination check - confirm.
    total_steps = 0

    for _episode in range(300):
        # Initial observation of the episode.
        current = env.reset()
        finished = False

        while not finished:
            # Refresh the environment display.
            env.render()

            # Pick an action and apply it to the environment.
            chosen = RL.choose_action(current)
            following, reward, finished = env.step(chosen)

            # Record the transition for later replay.
            RL.store_transition(current, chosen, reward, following)

            # Train on a sampled batch once enough steps have elapsed.
            past_warmup = total_steps > 200
            if past_warmup and total_steps % 5 == 0:
                RL.learn()

            # The next observation becomes the current one.
            current = following

            # The terminating step is not counted.
            if not finished:
                total_steps += 1