Keras Reinforcement Learning: FlappyBird

GitHub repository: https://github.com/ielcome2017/FlappyBird.git

Code

Data Generation

import numpy as np
import sys
import cv2
import random

from play import Game, GameOver

IMAGE_SHAPE = (80, 80)


def convert(img):
    img = cv2.cvtColor(cv2.resize(img, IMAGE_SHAPE), cv2.COLOR_BGR2GRAY)
    ret, img = cv2.threshold(img, 1, 255, cv2.THRESH_BINARY)
    return np.array(img)


class Memory:
    def __init__(self):
        self.time_step = 4          # number of stacked frames per state
        self.max_length = 50000     # capacity of the replay buffer
        self.head, self.next = self.time_step, 0
        # "art" packs the one-hot action (2 values), the reward and the terminal flag into one record
        self.memory = np.empty(self.max_length,
                               dtype=[("image", float, IMAGE_SHAPE), ("art", float, [4])])

    def memory_append(self, image, art):
        # ring-buffer write; once the buffer has wrapped, head advances together with next
        self.memory["image"][self.next % self.max_length] = image
        self.memory["art"][self.next % self.max_length] = art
        self.next = self.next + 1
        self.head += 1 if self.next > self.max_length else 0


class GameMemory(Memory):
    def __init__(self, func, count, flag="explore"):
        self.count = count
        self.func = func
        self.flag = flag

        self.explore = 3000000
        self.observer = 10000

        self.image_shape = (80, 80)
        self.pre_step_epoch = 10000
        super().__init__()

    def show(self):
        for _ in self.next_data():
            yield _

    def next_data(self):
        # parameter settings
        epsilon = 0.001 if self.flag in ["explore", "display"] else 0.1
        init_epsilon, final_epsilon = 0.1, 0.001
        action_dim = 2
        # initialization
        num = 40 if self.flag in ["explore", "train"] else 1
        game = Game(num)    # the game is the environment

        action = np.array([1, 0])
        image, reward, terminal = game.frame_step(action)

        image = convert(image)
        for _ in range(4):
            self.memory_append(image, [*action, reward, terminal])
        epsilon -= (init_epsilon - final_epsilon) / self.explore * self.count * self.pre_step_epoch
        epsilon = np.clip(epsilon, a_max=init_epsilon, a_min=final_epsilon)

        # resume the global step counter from previous training
        count = self.count * self.pre_step_epoch
        try:
            while True:
                # choose an action (epsilon-greedy)
                if random.random() < epsilon:
                    action_ind = np.random.randint(0, action_dim)
                else:
                    idx = (self.next - np.arange(1, self.time_step+1)) % self.max_length
                    state = self.memory["image"][idx]

                    state = np.transpose(state[np.newaxis, :, :], [0, 2, 3, 1])
                    action_ind = self.func(state).argmax(-1).astype("int")[0]   # the agent picks the action

                epsilon -= (init_epsilon - final_epsilon) / self.explore
                epsilon = np.clip(epsilon, a_max=init_epsilon, a_min=final_epsilon)
                count += 1

                action = game.get_event(action_ind)     # trigger the corresponding event in the game

                image, reward, terminal = game.frame_step(action)   # feedback (reward) from the environment
                image = convert(image)  # 80*80

                self.memory_append(image, [*action, reward, terminal])
                data = self.batch_data()
                if data is not None:
                    yield data

        except GameOver:
            print("\n{}> game close <{}".format("="*10, "="*10))

    def batch_data(self, batch_size=32):
        gamma = 0.99
        num_sample = self.next - self.head
        if num_sample < self.observer:
            sys.stdout.write("\r num of sample is : %d/%d" % (num_sample, self.observer))
            sys.stdout.flush()
            return None
        batch_ind = np.random.choice(np.arange(self.head, self.next), [batch_size]) % self.max_length
        # sample a batch for training
        image_ind = (batch_ind[:, np.newaxis] - np.arange(self.time_step)) % self.max_length
        # the sampled step holds the state produced by the predicted action, so the previous
        # state's record is compared against the prediction target y; e.g. for a sampled index of 4,
        # next_state is built from frames 1-4 and current_state from frames 0-3
        current_state = np.transpose(self.memory["image"][(image_ind - 1) % self.max_length], [0, 2, 3, 1])
        next_state = np.transpose(self.memory["image"][image_ind], [0, 2, 3, 1])
        art = self.memory["art"][batch_ind]
        action, reward, terminal = art[:, 0:2], art[:, -2], art[:, -1]
        out = self.func(next_state).max(-1)
        batch_y = reward + gamma * out * (1 - terminal)
        return [current_state, action], batch_y

Network

import keras
from keras.layers import Dense, Conv2D, MaxPool2D, \
    Input, Flatten, Dot, Activation


def NetV1():
    state_shape, action_dim = [80, 80, 4], 2
    actions = Input([action_dim])
    state = Input(state_shape)

    x = Conv2D(32, kernel_size=8, strides=4, padding="same")(state)
    x = Activation("relu")(x)
    x = MaxPool2D(pool_size=2)(x)

    x = Conv2D(64, kernel_size=4, strides=2, padding="same")(x)
    x = Activation('relu')(x)

    x = Conv2D(64, kernel_size=3, strides=1, padding="same")(x)
    x = Activation('relu')(x)

    x = Flatten()(x)
    x = Dense(512)(x)
    x = Activation('relu')(x)

    out1 = Dense(action_dim)(x)

    out2 = Dot(-1)([actions, out1])
    model = keras.Model([state, actions], out2)
    optimizer = keras.optimizers.Adam(1e-6)
    # optimizer = keras.optimizers.SGD(lr=1e-5, decay=1e-6, momentum=0.9, nesterov=True)
    loss = keras.losses.mse
    model.compile(optimizer=optimizer, loss=loss)
    model.summary()
    return model

Training

from agent import GameMemory
import keras
from net import NetV1, NetV2
import os

EPOCHS = 400
STEPS_PER_EPOCH = 10000
FLAG = "train"


def get_net(net_version):
    train_net, path = (NetV1(), "NETV1/") if net_version == 0 else (NetV2(), "NETV2/")
    os.makedirs(path, exist_ok=True)    # make sure the checkpoint directory exists
    call_function = [
        keras.callbacks.ModelCheckpoint(filepath=path + "weight.{epoch:02d}.h5")]

    if len(os.listdir(path)) == 0:
        return train_net, call_function, 0
    counts = [int(file.split(".")[1]) for file in os.listdir(path)]
    count = max(counts)
    filename = path + "weight.%02d.h5" % count
    train_net.load_weights(filename)
    return train_net, call_function, count


def train():
    net, call_function, count = get_net(net_version=0)
    agent = keras.backend.function(net.input[0], net.layers[-2].output)
    data = GameMemory(agent, count, flag=FLAG)    # flag in ["train", "explore", "display"]; training uses more random actions
    net.fit(data.next_data(), epochs=EPOCHS, initial_epoch=data.count,
            steps_per_epoch=STEPS_PER_EPOCH, callbacks=call_function)


if __name__ == '__main__':
    train()

Model Download

After downloading, extract the archive's contents into the FlappyBird project root directory:

FlappyBird
|-- model
|   |-- bird-dqn-2920000
|   |-- weight.292.h5
|-- NETV1
|   |-- weight.200.h5
How It Works

The train() function in main.py sets up the network and the callback functions.

Variable               Value
network                net
prediction function    func = Model(net.input[0], net.out1)
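
out1 is a local tensor inside NetV1 rather than a model attribute, so main.py builds the prediction function with keras.backend.function(net.input[0], net.layers[-2].output). A minimal sketch of the equivalent sub-model form from the table, assuming layers[-2] is the out1 Dense layer as main.py does:

import keras
from net import NetV1

net = NetV1()
# sub-model from the image input to the raw Q-values (out1), bypassing the action input and the dot product
func = keras.Model(net.input[0], net.layers[-2].output)
# q_values = func.predict(state_batch)   # shape: (batch, 2)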

Reinforcement learning involves five elements: the environment, the agent, the state, the reward function, and actions.

The agent produces an action, and the environment responds with the state that follows that action together with a reward. Note that this state is only a single frame returned by the environment; the training data is built from game frames, with four consecutive frames forming one game state, giving current_state and next_state.
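
For illustration, a minimal sketch of how four consecutive frames are stacked into one (80, 80, 4) state, with next_state shifted by a single frame; the zero arrays stand in for real converted game frames:

import numpy as np

frames = [np.zeros((80, 80), dtype=np.float32) for _ in range(5)]  # placeholders for converted frames

current_state = np.stack(frames[0:4], axis=-1)   # shape (80, 80, 4)
next_state = np.stack(frames[1:5], axis=-1)      # shape (80, 80, 4), one frame later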

The environment uses its reward function to score the game state, but the environment's reward only ever takes three values: game over (-1), success (1), and still flying (0.1).
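
As a hypothetical stand-alone helper (the real logic lives inside play.py), the reward scheme boils down to:

def reward_of(passed_pipe, crashed):
    # environment reward: -1 on a crash, 1 when a pipe is passed, 0.1 while simply staying alive
    if crashed:
        return -1.0
    return 1.0 if passed_pipe else 0.1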

The goal of the reinforcement learning setup is to train the agent. Each training sample has the following format (see the sketch after this list):

  1. the current game state (current_state),

  2. the action taken in that state (action),

  3. the ideal value of the next game state (next_state), i.e. the Q_value.
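
Put together, one training batch looks like the following sketch; the shapes match batch_data above and the arrays are placeholders:

import numpy as np

batch_size = 32
current_state = np.zeros((batch_size, 80, 80, 4))   # 1. current game states
action = np.zeros((batch_size, 2))                  # 2. one-hot actions that were taken
batch_y = np.zeros((batch_size,))                   # 3. targets computed from the next states (Q_value)
# the model is then fitted on ([current_state, action], batch_y)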

Training the agent is essentially fitting the agent's reward function. In FlappyBird, the bird only earns a reward of 1 when it flies past the center of the pipe gap; staying close to the center line and flying outside the pipes both yield 0.1. From the environment's point of view this is perfectly normal, as shown in the figure below.

To a human, however, the two situations are not equally valuable: in the state close to the center line (state 2 in the figure below), people instinctively feel the bird is doing better and deserves a higher reward. A freshly initialized network cannot reproduce this kind of reward function, so training the agent means fitting a reward function that comes close to the human intuition.

Q_value comes from the network's out1:

Q_value = max(func.predict(next_state))

The reward the agent believes the bird truly earns in that game state is then:

agent_reward = reward + gamma * Q_value * (1 - terminal)

where gamma = 0.99 is the discount factor used in batch_data.

terminal indicates whether the bird crashed; a value of 1 means it crashed.
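
A worked sketch of the target computation, matching batch_data above; predict_q and the zero arrays are stand-ins for the real prediction function and data:

import numpy as np

gamma = 0.99                                # discount factor, as in batch_data
batch = 32
next_state = np.zeros((batch, 80, 80, 4))   # placeholder batch of next states
reward = np.full((batch,), 0.1)             # environment rewards
terminal = np.zeros((batch,))               # 1 where the bird crashed, else 0

def predict_q(states):                      # stand-in for func, the agent's Q-value prediction
    return np.zeros((len(states), 2))

q_value = predict_q(next_state).max(axis=-1)               # best value reachable from the next state
agent_reward = reward + gamma * q_value * (1 - terminal)   # the future term vanishes when terminal == 1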

Given the game's current_state and the action, the agent uses its own fitted reward function to produce a value, current_reward, which is the network's out2:

current_reward = net.predict([current_state, action])

The training objective is to minimize the difference between current_reward and agent_reward; the smaller the gap, the better.
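
Because out2 is the dot product of the one-hot action with out1, current_reward is exactly the Q-value the network currently assigns to the chosen action, and the compiled MSE loss pulls it toward agent_reward. A minimal sketch of a single update step with placeholder data:

import numpy as np
from net import NetV1

net = NetV1()
batch = 32
current_state = np.zeros((batch, 80, 80, 4))
action = np.tile([1.0, 0.0], (batch, 1))    # one-hot actions that were actually taken
agent_reward = np.full((batch,), 0.1)       # targets from the previous sketch

current_reward = net.predict([current_state, action])       # out2: Q-value of the chosen action
net.train_on_batch([current_state, action], agent_reward)   # one MSE step toward agent_reward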

To speed up data generation, the game's frame rate is raised at the start: the normal FPS is 30, but during training it is set to 1000, because the network trains once per game frame and a slow frame rate would leave training waiting for frames. With this setting, each epoch of 10,000 records takes about 2 minutes 50 seconds, so 200 epochs take roughly 9.5 to 10 hours.

The result after 292 training epochs can be seen in the linked demo.

File Structure

net.py contains two network definitions, V1 and V2; currently only V1 is trained.

File                 Description
main.py              training script
agent.py             data generation
play.py              game runtime package
game\control.py      game scheduling
game\element.py      game configuration
game\engine.py       game backend, with two choices: PyQt5 and pygame
game\display.py      game engine written with PyQt5

Network structure (net.py)

   Layer (type)             Input Shape           Kernel           Stride   Output Shape
0  input_state                                                              (None, 80, 80, 4)
1  input_action                                                             (None, 2)
2  conv                     (None, 80, 80, 4)     (8, 8, 4, 32)    4        (None, 20, 20, 32)
3  pool                     (None, 20, 20, 32)                     2        (None, 10, 10, 32)
4  conv                     (None, 10, 10, 32)    (4, 4, 32, 64)   2        (None, 5, 5, 64)
5  conv                     (None, 5, 5, 64)      (3, 3, 64, 64)   1        (None, 5, 5, 64)
6  flatten                  (None, 5, 5, 64)                                (None, 1600)
7  fully_connected          (None, 1600)          (1600, 512)               (None, 512)
8  out1 (fully_connected)   (None, 512)           (512, 2)                  (None, 2)
9  out2 (dot)               [action, out1]                                  (None, 1)