Keras Reinforcement Learning: FlappyBird

GitHub repository: https://github.com/ielcome2017/FlappyBird.git

Code

Data Generation

import numpy as np
import sys
import cv2
import random

from play import Game, GameOver

IMAGE_SHAPE = (80, 80)


def convert(img):
    img = cv2.cvtColor(cv2.resize(img, IMAGE_SHAPE), cv2.COLOR_BGR2GRAY)
    ret, img = cv2.threshold(img, 1, 255, cv2.THRESH_BINARY)
    return np.array(img)


class Memory:
    def __init__(self):
        self.time_step = 4
        self.max_length = 50000
        self.head, self.next = self.time_step, 0
        # structured ring buffer: "image" holds the 80x80 frame, "art" packs [action(2), reward, terminal]
        self.memory = np.empty(self.max_length,
                               dtype=[("image", float, IMAGE_SHAPE), ("art", float, [4])])

    def memory_append(self, image, art):
        self.memory["image"][self.next % self.max_length] = image
        self.memory["art"][self.next % self.max_length] = art
        self.next = self.next + 1
        self.head += 1 if self.next > self.max_length else 0  # advance the oldest index once the buffer is full


class GameMemory(Memory):
    def __init__(self, func, count, flag="explore"):
        self.count = count
        self.func = func
        self.flag = flag

        self.explore = 3000000
        self.observer = 10000

        self.image_shape = (80, 80)
        self.pre_step_epoch = 10000
        super().__init__()

    def show(self):
        for _ in self.next_data():
            yield _

    def next_data(self):
        # parameter settings
        epsilon = 0.001 if self.flag in ["explore", "display"] else 0.1
        init_epsilon, final_epsilon = 0.1, 0.001
        action_dim = 2
        # initialization
        num = 40 if self.flag in ["explore", "train"] else 1
        game = Game(num)    # the game object is the environment

        action = np.array([1, 0])
        image, reward, terminal = game.frame_step(action)

        image = convert(image)
        for _ in range(4):
            self.memory_append(image, [*action, reward, terminal])
        epsilon -= (init_epsilon - final_epsilon) / self.explore * self.count * self.pre_step_epoch
        epsilon = np.clip(epsilon, a_max=init_epsilon, a_min=final_epsilon)

        # resume the global step count from the saved epoch
        count = self.count * self.pre_step_epoch
        try:
            while True:
                # choose an action (epsilon-greedy)
                if random.random() < epsilon:
                    action_ind = np.random.randint(0, action_dim)
                else:
                    idx = (self.next - np.arange(1, self.time_step+1)) % self.max_length
                    state = self.memory["image"][idx]

                    state = np.transpose(state[np.newaxis, :, :], [0, 2, 3, 1])
                    action_ind = self.func(state).argmax(-1).astype("int")[0]   # the agent chooses the action

                epsilon -= (init_epsilon - final_epsilon) / self.explore
                epsilon = np.clip(epsilon, a_max=init_epsilon, a_min=final_epsilon)
                count += 1

                action = game.get_event(action_ind)     # trigger the corresponding game event

                image, reward, terminal = game.frame_step(action)   # state and reward feedback from the environment
                image = convert(image)  # 80*80

                self.memory_append(image, [*action, reward, terminal])
                data = self.batch_data()
                if data is not None:
                    yield data

        except GameOver:
            print("\n{}> game close <{}".format("="*10, "="*10))

    def batch_data(self, batch_size=32):
        gamma = 0.99
        num_sample = self.next - self.head
        if num_sample < self.observer:
            sys.stdout.write("\r num of sample is : %d/%d" % (num_sample, self.observer))
            sys.stdout.flush()
            return None
        batch_ind = np.random.choice(np.arange(self.head, self.next), [batch_size]) % self.max_length
        # sample a batch for training
        image_ind = (batch_ind[:, np.newaxis] - np.arange(self.time_step)) % self.max_length
        # The sampled index is the state produced by the predicted action, so the frames one step
        # earlier form current_state and the prediction is compared against the target y.
        # E.g. one sampled index gives current_state frames [0, 1, 2, 3] and next_state frames [1, 2, 3, 4], shifted by one frame.
        current_state = np.transpose(self.memory["image"][(image_ind - 1) % self.max_length], [0, 2, 3, 1])
        next_state = np.transpose(self.memory["image"][image_ind], [0, 2, 3, 1])
        art = self.memory["art"][batch_ind]
        action, reward, terminal = art[:, 0:2], art[:, -2], art[:, -1]
        out = self.func(next_state).max(-1)
        batch_y = reward + gamma * out * (1 - terminal)
        return [current_state, action], batch_y

Network

import keras
from keras.layers import Dense, Conv2D, MaxPool2D, \
    Input, Flatten, Dot, Activation


def NetV1():
    state_shape, action_dim = [80, 80, 4], 2
    actions = Input([action_dim])
    state = Input(state_shape)

    x = Conv2D(32, kernel_size=8, strides=4, padding="same")(state)
    x = Activation("relu")(x)
    x = MaxPool2D(pool_size=2)(x)

    x = Conv2D(64, kernel_size=4, strides=2, padding="same")(x)
    x = Activation('relu')(x)

    x = Conv2D(64, kernel_size=3, strides=1, padding="same")(x)
    x = Activation('relu')(x)

    x = Flatten()(x)
    x = Dense(512)(x)
    x = Activation('relu')(x)

    out1 = Dense(action_dim)(x)

    out2 = Dot(-1)([actions, out1])
    model = keras.Model([state, actions], out2)
    optimizer = keras.optimizers.Adam(1e-6)
    # optimizer = keras.optimizers.SGD(lr=1e-5, decay=1e-6, momentum=0.9, nesterov=True)
    loss = keras.losses.mse
    model.compile(optimizer=optimizer, loss=loss)
    model.summary()
    return model

Training

from agent import GameMemory
import keras
from net import NetV1, NetV2
import os

EPOCHS = 400
STEPS_PER_EPOCH = 10000
FLAG = "train"


def get_net(net_version):
    train_net, path = (NetV1(), "NETV1/") if net_version == 0 else (NetV2(), "NETV2/")
    call_function = [
        keras.callbacks.ModelCheckpoint(filepath=path + "weight.{epoch:02d}.h5")]

    os.makedirs(path, exist_ok=True)   # make sure the checkpoint directory exists
    if len(os.listdir(path)) == 0:
        return train_net, call_function, 0
    counts = [int(file.split(".")[1]) for file in os.listdir(path)]
    count = max(counts)
    filename = path + "weight.%02d.h5" % count
    train_net.load_weights(filename)
    return train_net, call_function, count


def train():
    net, call_function, count = get_net(net_version=0)
    agent = keras.backend.function(net.input[0], net.layers[-2].output)
    data = GameMemory(agent, count, flag=FLAG)    # flag in ["train", "explore", "display"]; "train" keeps epsilon higher, so more random actions
    net.fit(data.next_data(), epochs=EPOCHS, initial_epoch=data.count,
            steps_per_epoch=STEPS_PER_EPOCH, callbacks=call_function)


if __name__ == '__main__':
    train()

Model Download

After downloading, extract the archive contents into the project root directory FlappyBird:

FlappyBird
|-- model
|   |-- bird-dqn-2920000
|   |-- weight.292.h5
|-- NETV1
|   |-- weight.200.h5
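
To check that the files are in the right place, the checkpoint can be loaded straight into NetV1. This is a minimal sketch reusing the loading idea from get_net in main.py; the path assumes the layout above.

from net import NetV1

net = NetV1()
net.load_weights("NETV1/weight.200.h5")   # load the provided checkpoint for further training or display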

Running Process

In main.py, the train() function sets up the network and the callbacks.

Variable               Definition
network                net
prediction function    func = Model(net.input[0], net.out1)
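
The prediction function in the table is the part of NetV1 that maps a state to the two Q-values (out1); main.py builds it with keras.backend.function. An equivalent functional-API sketch (the variable name q_func is illustrative) looks like this:

import keras
from net import NetV1

net = NetV1()
# out1 is the Dense(2) layer just before the Dot layer, i.e. net.layers[-2]
q_func = keras.Model(net.input[0], net.layers[-2].output)   # state (None, 80, 80, 4) -> Q-values (None, 2)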

Reinforcement learning involves five elements: the environment, the agent, the state, the reward function, and the action.

The agent produces an action, and the environment responds with the state that follows the action and a reward. Note that the state the environment returns is just a single frame; the training data is built from game frames, with four consecutive frames forming one game state, giving current_state and next_state.
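
A minimal sketch of this frame stacking, assuming the (80, 80) frame size from agent.py (the arrays here are placeholders):

import numpy as np

frames = np.zeros((5, 80, 80))                    # five consecutive preprocessed frames f0..f4
current_state = np.stack(frames[0:4], axis=-1)    # f0..f3 stacked -> shape (80, 80, 4)
next_state = np.stack(frames[1:5], axis=-1)       # f1..f4, shifted by one frame -> shape (80, 80, 4)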

The environment assigns a reward to each game state through the reward function, but the environment's reward takes only three values: game over (-1), passing a pipe (1), and still in progress (0.1).
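
As a sketch only (the real reward comes from Game.frame_step in play.py; the helper below is hypothetical), the three reward levels are:

def env_reward(crashed, passed_pipe):
    # hypothetical helper illustrating the three reward levels described above
    if crashed:
        return -1.0      # game over
    if passed_pipe:
        return 1.0       # flew past a pipe gap
    return 0.1           # still flying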

The goal of reinforcement learning here is to train the agent. The training data takes the following form (a shape-only sketch of one sample follows the list):

  1. the current game state (current_state),

  2. the current game action (action),

  3. the ideal value (Q_value) derived from the next game state (next_state).
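
A shape-only sketch of one such sample, with placeholder values (shapes follow batch_data in agent.py):

import numpy as np

current_state = np.zeros((1, 80, 80, 4))   # four stacked frames
action = np.array([[0.0, 1.0]])            # one-hot action, e.g. "flap"
target = np.array([0.892])                 # reward + gamma * max Q(next_state), see below
# net.fit consumes batches shaped like ([current_state, action], target)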

Training the agent is essentially fitting the agent's reward (value) function. In FlappyBird, only flying past the center line of a pipe gap earns a reward of 1, while approaching the center line and flying outside the pipes both earn 0.1; from the environment's point of view this is normal, as shown in the figure below.

To a human, however, these two situations are by no means of equal value: for a state close to the center line (state 2 in the figure below), people instinctively feel the bird is doing better and deserves a higher reward. A freshly initialized network cannot reproduce this kind of reward function, so training the agent means fitting a reward function that comes close to human intuition.

Q_value is taken from the net's out1:

Q_value = max(func.predict(next_state))

The agent then takes the Bird's true reward in this game state to be (with discount factor gamma = 0.99, as in batch_data):

agent_reward = reward + gamma * Q_value * (1 - terminal)

terminal indicates whether the Bird crashed; 1 means it crashed.

Given the game's current_state and action, the agent uses the reward function it has fitted to produce a reward value current_reward, which is the net's out2:

current_reward = net.predict([current_state, action])

Training minimizes the difference between current_reward and agent_reward; the smaller the gap, the better.
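
A tiny worked example with made-up numbers, mirroring batch_data in agent.py (gamma = 0.99 there):

gamma = 0.99
reward, terminal = 0.1, 0.0    # the bird is still flying
Q_value = 0.8                  # made-up max of out1 on next_state
agent_reward = reward + gamma * Q_value * (1 - terminal)   # 0.1 + 0.99 * 0.8 = 0.892
current_reward = 0.75          # made-up out2 on (current_state, action)
loss = (current_reward - agent_reward) ** 2                # the MSE the optimizer minimizes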

To speed up data generation, the game's frame rate is raised at the start: the normal fps is 30, but during training it is set to 1000, because the network trains once per game frame and a slow frame rate would leave training stuck waiting. With this adjustment, each epoch of 10,000 steps takes about 2 minutes 50 seconds, so 200 epochs take roughly 9.5 to 10 hours.

For the result after 292 epochs, see the linked demo.

File Structure

net.py defines the network structure and contains two networks, V1 and V2; currently only V1 is trained.

File                Description
main.py             training entry point
agent.py            data generation
play.py             game runtime package
game\control.py     game scheduling
game\element.py     game configuration
game\engine.py      game backend; two choices: PyQt5 and pygame
game\display.py     game engine written with PyQt5

Network Structure (net.py)

#  Layer (type)              Input Shape           Kernel           Stride  Output Shape
0  input_state               -                     -                -       (None, 80, 80, 4)
1  input_action              -                     -                -       (None, 2)
2  conv                      (None, 80, 80, 4)     (8, 8, 4, 32)    4       (None, 20, 20, 32)
3  pool                      (None, 20, 20, 32)    -                2       (None, 10, 10, 32)
4  conv                      (None, 10, 10, 32)    (4, 4, 32, 64)   2       (None, 5, 5, 64)
5  conv                      (None, 5, 5, 64)      (3, 3, 64, 64)   1       (None, 5, 5, 64)
6  flatten                   (None, 5, 5, 64)      -                -       (None, 1600)
7  fully_connected           (None, 1600)          (1600, 512)      -       (None, 512)
8  out1 (fully_connected)    (None, 512)           (512, 2)         -       (None, 2)
9  out2 (dot)                [action, out1]        -                -       (None, 1)
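
A quick sanity check of the shapes in the table, assuming "same" padding so each strided layer outputs ceil(input / stride):

import math

def same_out(size, stride):
    # spatial output size with "same" padding: ceil(size / stride)
    return math.ceil(size / stride)

assert same_out(80, 4) == 20     # conv, 8x8 kernel, stride 4
assert same_out(20, 2) == 10     # max-pool, stride 2
assert same_out(10, 2) == 5      # conv, 4x4 kernel, stride 2
assert same_out(5, 1) == 5       # conv, 3x3 kernel, stride 1
assert 5 * 5 * 64 == 1600        # flatten feeds the Dense(512) layer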