Deep Q-Learning: Deep Reinforcement Learning (Code Walkthrough)

Building the DQN

Initialization

# number of actions
self.n_actions = n_actions
# number of state features
self.n_features = n_features
# learning rate
self.lr = learning_rate
# reward discount factor used in the Q-learning update
self.gamma = reward_decay
# upper bound of the epsilon-greedy exploitation probability
self.epsilon_max = e_greedy
# number of learning steps between two updates of the target network's parameters
self.replace_target_iter = replace_target_iter
# capacity of the replay memory
self.memory_size = memory_size
# number of samples drawn from the memory per learning step
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
# counter of learning steps taken so far
self.learn_step_counter = 0
# replay memory: each row holds the current state's n_features, the next state's n_features, the reward and the action
self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
# history of training losses (appended to in learn())
self.cost_his = []

# copy the eval (Q-estimate) network's parameters into the target (Q-target) network
t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
# builds a list of TensorFlow ops, e.g. [tf.assign(t1, e1), tf.assign(t2, e2), tf.assign(t3, e3)]
self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
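To see what this list of assign ops actually does, here is a minimal self-contained sketch (TensorFlow 1.x, with two toy variables standing in for the real network weights; none of this is part of the class above): after running the op list once, the "target" variable holds an exact copy of the "eval" variable.

import numpy as np
import tensorflow as tf

tf.reset_default_graph()

# toy stand-ins for the eval/target network weights, registered in the same collections
with tf.variable_scope('eval_net'):
    e_w = tf.get_variable('w', [2, 3],
                          initializer=tf.random_normal_initializer(0., 0.3),
                          collections=['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES])
with tf.variable_scope('target_net'):
    t_w = tf.get_variable('w', [2, 3],
                          initializer=tf.random_normal_initializer(0., 0.3),
                          collections=['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES])

t_params = tf.get_collection('target_net_params')
e_params = tf.get_collection('eval_net_params')
replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(np.allclose(*sess.run([t_w, e_w])))   # False: the two variables were initialized independently
    sess.run(replace_target_op)                 # hard update: copy eval -> target
    print(np.allclose(*sess.run([t_w, e_w])))   # True: the target variable is now an exact copy

In the agent, this copy is re-run every replace_target_iter learning steps (see learn() below), which is what keeps the target network's outputs stable in between.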

Building the neural networks

Constructing the Q-evaluation network

def _build_net(self):
    # input: the current state
    self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')
    # input: the target Q values (fed in when computing the loss)
    self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')

    with tf.variable_scope('eval_net'):
        # collections the variables are registered in
        c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
        # number of neurons in the first layer
        n_l1 = 10
        # weight initializer
        w_initializer = tf.random_normal_initializer(0., 0.3)
        # bias initializer
        b_initializer = tf.constant_initializer(0.1)

        # first layer
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
            b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
            l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)
        # second layer: one Q value per action
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
            b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
            self.q_eval = tf.matmul(l1, w2) + b2

        # loss: mean squared error between the target Q values and the estimated Q values
        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))

        # training op
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

Constructing the Q-target network (this block continues directly from the one above and is still part of _build_net())

    # input: the next state
    self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')
    with tf.variable_scope('target_net'):
        # collections the variables are registered in
        c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

        # first layer
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
            b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
            l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

        # second layer: one Q value per action for the next state
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
            b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
            self.q_next = tf.matmul(l1, w2) + b2
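As a quick sanity check of the two-network setup (a usage sketch, assuming `RL` is an instance of the agent class built from the code above, with `RL.sess` holding an already-initialized tf.Session): immediately after running replace_target_op, both networks must return identical Q values for the same input, since they share the same architecture and now the same parameters.

import numpy as np

obs = np.zeros((1, RL.n_features), dtype=np.float32)    # dummy observation
RL.sess.run(RL.replace_target_op)                        # hard copy: eval -> target
q_e = RL.sess.run(RL.q_eval, feed_dict={RL.s: obs})      # eval net on the current-state input
q_t = RL.sess.run(RL.q_next, feed_dict={RL.s_: obs})     # target net on the next-state input
print(np.allclose(q_e, q_t))                             # True right after the copy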

Storing transitions

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        # s: current state as a list ==> [x, y]
        # [action, reward]: the action and the reward merged into one list
        # s_: next state ==> [x_next, y_next]
        transition = np.hstack((s, [a, r], s_))
        # the hstack result is ==> [x, y, a, r, x_next, y_next]

        # once memory_size transitions have been stored, start overwriting the oldest ones
        index = self.memory_counter % self.memory_size

        # memory is a 2-D array and transition a row vector; write it into row `index`
        self.memory[index, :] = transition
        self.memory_counter += 1
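The modulo indexing turns `memory` into a ring buffer: once memory_size transitions have been written, the oldest rows are overwritten first. A tiny standalone sketch with made-up sizes (memory_size = 4, n_features = 2, not the values used in the agent) shows this:

import numpy as np

memory_size, n_features = 4, 2
memory = np.zeros((memory_size, n_features * 2 + 2))

for memory_counter in range(6):                    # store 6 transitions in a 4-row buffer
    s, a, r, s_ = [memory_counter, 0], 1, 0.5, [memory_counter + 1, 0]
    transition = np.hstack((s, [a, r], s_))        # [x, y, a, r, x_next, y_next]
    index = memory_counter % memory_size           # row index wraps around: 0, 1, 2, 3, 0, 1
    memory[index, :] = transition

print(memory)   # rows 0 and 1 now hold the 5th and 6th transitions; the oldest two were overwritten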

Choosing an action

    def choose_action(self, observation):
        # turn the observation [x, y] into a row vector [[x, y]]
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # get the estimated Q value of every action
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            # pick the action with the largest Q value
            action = np.argmax(actions_value)
        else:
            # otherwise explore: pick a random action
            action = np.random.randint(0, self.n_actions)
        return action
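Note that epsilon is used here as the probability of exploiting (taking the greedy action). When e_greedy_increment is given, epsilon starts at 0 and is raised a little after every learning step until it reaches epsilon_max (see the last lines of learn() below), so the agent explores heavily at first and becomes greedier over time. A sketch of that schedule with made-up values (epsilon_max = 0.9 and an increment of 0.001, both assumptions for illustration):

epsilon, epsilon_max, epsilon_increment = 0.0, 0.9, 0.001

trace = []
for learn_step in range(1001):
    trace.append(round(epsilon, 3))
    # the same update used at the end of learn()
    epsilon = epsilon + epsilon_increment if epsilon < epsilon_max else epsilon_max

print(trace[0], trace[500], trace[1000])   # 0.0 0.5 0.9 -> greediness ramps up, then stays capped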

The learning step

    def learn(self):
        # every replace_target_iter steps, copy the eval network's parameters into the target network
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)

        # sample batch_size transitions; if the memory is not full yet, sample only from the rows already filled
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)

        # each sampled row is one transition
        # [x, y, a, r, x_next, y_next]
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],  # next states -> target net (fixed params)
                self.s: batch_memory[:, :self.n_features],    # current states -> eval net (newest params)
            })

        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)   # the action stored in each transition
        reward = batch_memory[:, self.n_features + 1]                   # the reward stored in each transition

        # Q-learning target: r + gamma * max_a' Q_target(s', a'), written only into the column of the action taken
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        # train the eval network on the difference between q_target and q_eval
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        # gradually increase epsilon so the agent becomes greedier over time
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

A worked example of the learn() step

Data shapes

  • n_actions = 3
  • n_features = 2
  • batch_size = 2

Structure of q_eval

          action_0   action_1   action_2
sample 0      1          2          1
sample 1      2          3          2

Rows: one per sample in the batch.
Columns: the estimated Q value of each action.

q_next and q_target have the same structure as q_eval.

batch_index: the sample indices

1-D array ==> [0, 1]   # length: batch_size

eval_act_index: the action each sample actually took, i.e. the column index to update for that sample

1-D array ==> [0, 2]

reward: the reward each sample received

1-D array ==> [1, 2]


Procedure

  1. Copy the values of q_eval into q_target.
  2. For each sample, use the Q-learning rule (reward + gamma * max over that sample's row of q_next) to compute the target Q value of the action it actually took. For illustration, assume these targets come out to -1 and -2:
    • sample 0 took action 0, so its target Q value is -1
    • sample 1 took action 2, so its target Q value is -2
  3. Write these targets into q_target:

          action_0   action_1   action_2
sample 0     -1          2          1
sample 1      2          3         -2

  4. Train the network on the difference between the updated q_target and q_eval. Only the entries of the actions actually taken differ from q_eval, so all other actions contribute zero loss. A numpy sketch of these four steps follows below.
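Below is a self-contained numpy sketch of the four steps, built from the q_eval table above. The discount factor and the contents of q_next are made-up values (so the numeric targets differ from the -1/-2 placeholders in the tables), but the indexing is exactly what learn() does:

import numpy as np

gamma = 0.9                                           # assumed discount factor
batch_size = 2

q_eval = np.array([[1., 2., 1.],                      # the q_eval table above
                   [2., 3., 2.]])
q_next = np.array([[0.5, 0.2, 0.1],                   # made-up target-network outputs for the next states
                   [0.3, 0.8, 0.4]])

batch_index = np.arange(batch_size, dtype=np.int32)   # [0, 1]
eval_act_index = np.array([0, 2])                     # the action each sample took
reward = np.array([1., 2.])                           # the reward each sample received

q_target = q_eval.copy()                              # step 1: start from q_eval
# steps 2 + 3: overwrite only the column of the action actually taken
q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)

print(q_target)
# [[1.45 2.   1.  ]
#  [2.   3.   2.72]]
# step 4: feeding this q_target (together with the same states) back into the network
# trains only on the (sample, action) entries that were actually experienced.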


The simulation loop

def run_maze():
    # total number of environment steps taken across all episodes
    step = 0
    # play 300 episodes
    for episode in range(300):
        # reset the environment and get the initial observation
        observation = env.reset()

        while True:
            # render the environment
            env.render()

            # choose an action for the current observation
            action = RL.choose_action(observation)

            # apply the action to the environment, which returns:
            # the next observation observation_,
            # the reward,
            # and the flag done indicating whether the episode has ended
            observation_, reward, done = env.step(action)

            # store the transition
            RL.store_transition(observation, action, reward, observation_)

            # once enough transitions have been stored, learn from a random batch every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # move on to the next observation
            observation = observation_

            # end the episode if the environment says it is over
            if done:
                break

            step += 1