Reinforcement Learning: Q-learning Algorithm Principles and Implementation

The Idea Behind Q-learning

The first Spring Festival of the 2020s is almost here, so let me wish everyone a happy New Year in advance. For someone like me who no longer gets a winter break, the holiday is not as exciting as it used to be, but I still look forward to it, because there is that little year-end bonus to hope for. A year of work has had its share of hard effort, of hesitation, and of slacking off, and all of it shows up at year's end. Through diligence and through laziness I have passed through many states this year, and after this whole sequence of state transitions I very much hope the year-end result turns out well. If we discretize the year in time, then at every step an action is taken, a state transition happens, and a corresponding value (reward) is received. What I want is for the total value accumulated over all these transitions to be as high as possible. That, in a nutshell, is Q-learning in reinforcement learning.

Let $S$ be the set of states and $A$ the set of actions. Taking action $a_t$ in state $s_t$ moves the agent to a new state $s_{t+1}$ and yields a reward (or penalty) $r_{t+1}$. For example, suppose at some point in the year I am in the slacking-off state and then take the action of working overtime; the boss notices and gives me a raise. Or, while in the hard-working state, I suddenly feel down, start coasting, and slip back into the slacking-off state; the boss notices and cuts my pay. That is what the state-action transitions in Q-learning mean. Naturally, I want to maximize my total gain for the year, which can be written as $\sum_{t=1}^{T} r_t$.
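As a minimal sketch of this setup (the step function, the state names, and the reward values here are hypothetical, not from the original post), the return of one episode is simply the sum of the rewards collected along the way:

import random

# Hypothetical toy version of the "working vs. slacking" story above.
def step(state, action):
    # Working overtime leads to the "working" state and a raise (+1);
    # slacking off leads to the "slacking" state and a pay cut (-1).
    if action == 'work_hard':
        return 'working', 1
    return 'slacking', -1

def episode_return(T=12):
    state, total = 'slacking', 0
    for t in range(T):
        action = random.choice(['work_hard', 'slack_off'])  # a purely random policy
        state, reward = step(state, action)
        total += reward  # accumulate the sum of r_t over t = 1..T
    return total

print(episode_return())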

When the agent is in some state, the Bellman optimality principle applies: an optimal path from $(i_0, j_0)$ to $(i_f, j_f)$ that passes through a point $(i, j)$ is the concatenation of an optimal path from the origin $(i_0, j_0)$ to the node $(i, j)$ and an optimal path from $(i, j)$ to the terminal point $(i_f, j_f)$. The state transition can therefore be expressed recursively.
$$Q_{next}(s_t, a_t) = Q_{now}(s_t, a_t) + b\,\beta, \qquad \beta = r_{t+1} + \gamma \max_{a} Q(s_{t+1}, a)$$

Here $\gamma$ is the discount factor and $b$ is a step-size (learning-rate) coefficient. Strictly speaking, the standard Q-learning update uses the temporal-difference error $\beta = r_{t+1} + \gamma \max_{a} Q(s_{t+1}, a) - Q_{now}(s_t, a_t)$, which is what Q_test.py in the appendix implements; the simple room example below takes $b = 1$ and assigns $Q$ directly, as in equation (1.1).
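A minimal sketch of the tabular update in Python (the function name and arguments here are my own, for illustration only):

import numpy as np

def q_update(Q, s, a, r, s_next, b=0.1, gamma=0.8):
    # Temporal-difference target: reward plus the discounted best value of the next state.
    target = r + gamma * np.max(Q[s_next])
    # Move Q(s, a) a step of size b toward the target.
    Q[s, a] += b * (target - Q[s, a])
    return Q

Q = np.zeros((6, 6))
Q = q_update(Q, s=1, a=5, r=100, s_next=5)  # one update at the room example's scale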
Below, we use a concrete example to illustrate how the expression for $\beta$ is implemented.

[Figure: layout of the rooms, numbered 0-5]
Consider the set of rooms shown in the figure above, where room 5 is the final destination. Represented as a graph, the structure is as follows:
[Figure: graph representation of the rooms and their connections]
The reward matrix for transitions between the rooms is as follows:
[Figure: reward matrix R, with -1 for impossible moves, 0 for allowed moves, and 100 for moves into room 5]
The value-function update, equation (1.1), is then written as:

Q(state, action) = R(state, action) + Gamma * Max[Q(next state, all actions)]

Python implementation of the Q-learning algorithm

# -*- coding: utf-8 -*-
# @Time    : 2020/1/10 9:35
# @Author  : HelloWorld!
# @FileName: q_room.py
# @Software: PyCharm
# @Operating System: Windows 10
# @Python.version: 3.6

import numpy as np
import random

# Initialize the 6x6 Q matrix
Q = np.zeros((6, 6))
Q = np.matrix(Q)

# Reward matrix R

R = np.matrix([[-1, -1, -1, -1, 0, -1], [-1, -1, -1, 0, -1, 100], [-1, -1, -1, 0, -1, -1], [-1, 0, 0, -1, 0, -1],
               [0, -1, -1, 0, -1, 100], [-1, 0, -1, -1, 0, 100]])

# Learning parameter: the discount factor
gamma = 0.8
# Training

for i in range(2000):

    # For each episode, randomly pick a starting state

    state = random.randint(0, 5)

    while True:

        # Collect all actions that are possible in the current state

        r_pos_action = []

        for action in range(6):

            if R[state, action] >= 0:
                r_pos_action.append(action)

        next_state = r_pos_action[random.randint(0, len(r_pos_action) - 1)]

        Q[state, next_state] = R[state, next_state] + gamma * (Q[next_state]).max()  # update as in equation (1.1)

        state = next_state
        if i%100==0:
            print('---------------------',i)
            print(Q)
        # State 5 is the goal state; end the episode once it is reached

        if state == 5:
            break

print(Q)
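
After training, the learned Q matrix can be read greedily to recover a path to room 5. The following check is my own addition (not part of the original post) and assumes the loop above has converged:

# Follow the greedy policy from each starting room and print the path to room 5.
for start in range(5):
    state = start
    path = [state]
    while state != 5:
        state = int(np.argmax(Q[state]))  # take the action with the largest Q value
        path.append(state)
    print('greedy path from room {}: {}'.format(start, path))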

Conclusion

Q-learning is the most basic reinforcement learning algorithm. When the actions are finite but the number of states is very large, a deep neural network can be used to approximate the value function. In short, once the states, the actions, and the value function have been properly mapped out, a problem can be tackled with reinforcement learning.
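As a rough sketch of that idea (my own illustration, not part of the original post; the state dimension, layer sizes, and action count are arbitrary), a small network maps a state vector to one Q value per action instead of storing a table:

import numpy as np

np.random.seed(0)
# Illustrative sizes only: a 4-dimensional state, 16 hidden units, 2 actions.
state_dim, hidden, n_actions = 4, 16, 2
W1, b1 = np.random.randn(state_dim, hidden) * 0.1, np.zeros(hidden)
W2, b2 = np.random.randn(hidden, n_actions) * 0.1, np.zeros(n_actions)

def q_values(state):
    # Forward pass: state vector -> a vector of Q values, one per action.
    h = np.maximum(0, state.dot(W1) + b1)  # ReLU hidden layer
    return h.dot(W2) + b2

s = np.random.randn(state_dim)
print(q_values(s), 'greedy action:', int(np.argmax(q_values(s))))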
Appendix: an example of choosing a path from a starting point to a destination through random exploration. Reference: https://blog.csdn.net/shankezh/article/details/102864085

You will need three PNG images named boom.png, diamond.png, and player.png, placed in the same directory as Q_learing.py and Q_test.py. The first listing below is Q_learing.py (the grid-world environment and its rendering); the second, starting at the Q_test.py file header, contains the agent and the training loop.

# -*- coding: utf-8 -*-
# @Time    : 2020/1/8 14:34
# @Author  : HelloWorld!
# @FileName: Q_learing.py
# @Software: PyCharm
# @Operating System: Windows 10
# @Python.version: 3.6


import tkinter as tk

from PIL import ImageTk

from PIL import Image

import time


class Env:

    def __init__(self):

        self.grid_size = 100

        self.win = tk.Tk()

        self.pic_player, self.pic_diamond, self.pic_boom1, self.pic_boom2, self.pic_boom3, self.pic_boom4 = self.__load_img()

        self.__init_win()

        self.canvas = self.__init_rc()

        self.texts = self.__produce_text()

        self.canvas.pack()

        # self._init_test_case()

        # self.win.mainloop()

    def __init_win(self):

        self.win.title('Grid World')

        # self.win.geometry("500x300")

    def __init_rc(self):

        canvas = tk.Canvas(self.win, width=500, height=720, bg='white')

        for h in range(5):

            for v in range(5):
                canvas.create_rectangle(self.grid_size * v, self.grid_size * h, self.grid_size * (v + 1),
                                        self.grid_size * (h + 1))

        trans_pixel = int(self.grid_size / 2)

        self.player = canvas.create_image(trans_pixel + self.grid_size * 0, trans_pixel + self.grid_size * 0,
                                          image=self.pic_player)

        self.diamond = canvas.create_image(trans_pixel + self.grid_size * 4, trans_pixel + self.grid_size * 4,
                                           image=self.pic_diamond)

        self.boom1 = canvas.create_image(trans_pixel + self.grid_size * 1, trans_pixel + self.grid_size * 1,
                                         image=self.pic_boom1)

        self.boom2 = canvas.create_image(trans_pixel + self.grid_size * 3, trans_pixel + self.grid_size * 1,
                                         image=self.pic_boom2)

        self.boom3 = canvas.create_image(trans_pixel + self.grid_size * 1, trans_pixel + self.grid_size * 3,
                                         image=self.pic_boom3)

        self.boom4 = canvas.create_image(trans_pixel + self.grid_size * 3, trans_pixel + self.grid_size * 3,
                                         image=self.pic_boom4)

        return canvas

    def __load_img(self):

        pic_resize = int(self.grid_size / 2)

        player = ImageTk.PhotoImage(Image.open("player.png").resize((pic_resize, pic_resize)))

        diamond = ImageTk.PhotoImage(Image.open("diamond.png").resize((pic_resize, pic_resize)))

        boom1 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))

        boom2 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))

        boom3 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))

        boom4 = ImageTk.PhotoImage(Image.open('boom.png').resize((pic_resize, pic_resize)))

        return player, diamond, boom1, boom2, boom3, boom4

    def __produce_text(self):

        texts = []

        x = self.grid_size / 2

        y = self.grid_size / 6

        for h in range(5):

            for v in range(5):
                up = self.canvas.create_text(x + h * self.grid_size, y + v * self.grid_size, text=0)

                down = self.canvas.create_text(x + h * self.grid_size, self.grid_size - y + v * self.grid_size, text=0)

                left = self.canvas.create_text(y + h * self.grid_size, x + v * self.grid_size, text=0)

                right = self.canvas.create_text(self.grid_size - y + h * self.grid_size, x + v * self.grid_size, text=0)

                texts.append({"up": up, "down": down, "left": left, "right": right})

        return texts

    def _win_d_update(self):

        self.win.update()

        time.sleep(0.1)


class GridWorld(Env):

    def __init__(self):

        super().__init__()

        self._win_d_update()

    def player_move(self, x, y):

        # x moves horizontally to the right, y moves vertically downward

        self.canvas.move(self.player, x * self.grid_size, y * self.grid_size)

        self._win_d_update()

    def reset(self):

        # Reset the player to the starting position

        x, y = self.canvas.coords(self.player)

        self.canvas.move(self.player, -x + self.grid_size / 2, -y + self.grid_size / 2)

        self._win_d_update()

        return self.get_state(self.player)

    def get_state(self, who):

        x, y = self.canvas.coords(who)

        state = [int(x / self.grid_size), int(y / self.grid_size)]

        return state

    def update_val(self, num, arrow, val):

        pos = num[0] * 5 + num[1]

        x, y = self.canvas.coords(self.texts[pos][arrow])

        self.canvas.delete(self.texts[pos][arrow])

        self.texts[pos][arrow] = self.canvas.create_text(x, y, text=val)

        # self._win_d_update()

    def exec_calc(self, action):

        # Execute one decision step

        feedback = 'alive'  # alive, stop, dead: moved normally, hit a wall, killed by a bomb

        next_state = []

        next_h, next_v, reward = 0.0, 0.0, 0.0

        h, v = self.get_state(self.player)

        if action == 0:  # up

            next_h = h

            next_v = v - 1

            # self.player_move(0, -1)

        elif action == 1:  # down

            next_h = h

            next_v = v + 1

            # self.player_move(0, 1)

        elif action == 2:  # left

            next_h = h - 1

            next_v = v

            # self.player_move(-1, 0)

        elif action == 3:  # right

            next_h = h + 1

            next_v = v

            # self.player_move(1, 0)

        else:

            print('programmer bug ...')

        next_state = [next_h, next_v]

        boom1, boom2, boom3, boom4 = (self.get_state(self.boom1), self.get_state(self.boom2),
                                      self.get_state(self.boom3), self.get_state(self.boom4))

        diamond = self.get_state(self.diamond)

        if next_h < 0 or next_v < 0 or next_h > 4 or next_v > 4:  # out of bounds

            reward = -1

            feedback = 'stop'

        elif next_state == boom1 or next_state == boom2 or next_state == boom3 or next_state == boom4:  # bomb cell

            reward = -100

            feedback = 'dead'

        elif next_state == diamond:  # reached the goal item (the diamond)

            reward = 500

        else:

            reward = 0

        return feedback, next_state, reward

    def update_view(self, state, action, next_state, q_val):

        action_list = ['up', 'down', 'left', 'right']

        self.player_move(next_state[0] - state[0], next_state[1] - state[1])

        self.update_val(state, action_list[action], round(q_val, 2))

    def attach(self):

        # Return True if the player has reached the goal, False otherwise

        return str(self.get_state(self.player)) == str(self.get_state(self.diamond))

# -*- coding: utf-8 -*-
# @Time    : 2020/1/8 14:35
# @Author  : HelloWorld!
# @FileName: Q_test.py
# @Software: PyCharm
# @Operating System: Windows 10
# @Python.version: 3.6


import numpy as np

import Q_learing


class Agent:

    def __init__(self):

        self.actions = [0, 1, 2, 3]  # up down left right

        self.q_table = dict()

        self.__init_q_table()

        self.epsilon = 0.1

        self.learning_rate = 0.1

        self.gamma = 0.8

        # print(self.q_table)

    def __init_q_table(self):

        for v in range(5):

            for h in range(5):
                self.q_table[str([h, v])] = [0.0, 0.0, 0.0, 0.0]

    def get_action(self, state):

        # Pick the next action for this state, excluding actions marked as impassable (negative Q values)

        action_list = self.q_table[str(state)]

        pass_action_index = []

        for index, val in enumerate(action_list):

            if val >= 0:
                pass_action_index.append(index)

        # Use an epsilon-greedy strategy to choose the action

        if np.random.rand() <= self.epsilon:

            # Explore: pick a random valid action

            return np.random.choice(pass_action_index)

        else:

            # Exploit: pick the action with the largest Q value

            max_val = action_list[pass_action_index[0]]

            max_list = []

            for i in pass_action_index:

                # If several actions tie for the maximum, choose among them at random

                if max_val < action_list[i]:

                    max_list.clear()

                    max_val = action_list[i]

                    max_list.append(i)

                elif max_val == action_list[i]:

                    max_list.append(i)

            return np.random.choice(max_list)

    def update_q_table(self, feedback, state, action, reward, next_state):

        # Q(s,a) = Q(s,a) + lr * { reward + gamma * max[Q(s`,a`)] - Q(s,a) }

        q_s_a = self.q_table[str(state)][action]  # Q value of the current state-action pair

        if feedback == 'stop':

            q_ns_a = 0  # hitting a wall has no next state; the player stays where it was

        else:

            q_ns_a = np.max(self.q_table[str(next_state)])

        # Bellman equation update; the active line below is an equivalent rewriting of the commented-out form

        # self.q_table[str(state)][action] = q_s_a + self.learning_rate * (

        #     reward + self.gamma * q_ns_a - q_s_a

        # )

        self.q_table[str(state)][action] = (1 - self.learning_rate) * q_s_a + self.learning_rate * (
                    reward + self.gamma * q_ns_a)

        # print(self.q_table)

        return self.q_table[str(state)][action]


if __name__ == '__main__':

    np.random.seed(0)

    env = Q_learing.GridWorld()

    agent = Agent()

    for ep in range(2000):

        if ep < 100:

            agent.epsilon = 0.2

        else:

            agent.epsilon = 0.1

        state = env.reset()

        print('Episode {} starts ... '.format(ep + 1))

        while not env.attach():

            action = agent.get_action(state)  # generate an action

            # print(action)

            feedback, next_state, reward = env.exec_calc(action)  # compute the outcome of the move

            q_val = agent.update_q_table(feedback, state, action, reward, next_state)  # update the Q table

            if feedback == 'stop':

                env.update_view(state, action, state, q_val)

                continue

            elif feedback == 'dead':

                env.update_view(state, action, next_state, q_val)

                break

            else:

                env.update_view(state, action, next_state, q_val)

            state = next_state  # the state changes
