Keras Reinforcement Learning: FlappyBird

GitHub repository: https://github.com/ielcome2017/FlappyBird.git

Code

Data Generation

import numpy as np
import sys
import cv2
import random

from play import Game, GameOver

IMAGE_SHAPE = (80, 80)


def convert(img):
    img = cv2.cvtColor(cv2.resize(img, IMAGE_SHAPE), cv2.COLOR_BGR2GRAY)
    ret, img = cv2.threshold(img, 1, 255, cv2.THRESH_BINARY)
    return np.array(img)


class Memory:
    def __init__(self):
        self.time_step = 4          # number of stacked frames per state
        self.max_length = 50000     # capacity of the replay buffer
        self.head, self.next = self.time_step, 0
        # "art" packs the one-hot action (2 values), the reward and the terminal flag into one record
        self.memory = np.empty(self.max_length,
                               dtype=[("image", float, IMAGE_SHAPE), ("art", float, [4])])

    def memory_append(self, image, art):
        # ring-buffer write; once the buffer has wrapped, head advances together with next
        self.memory["image"][self.next % self.max_length] = image
        self.memory["art"][self.next % self.max_length] = art
        self.next = self.next + 1
        self.head += 1 if self.next > self.max_length else 0


class GameMemory(Memory):
    def __init__(self, func, count, flag="explore"):
        self.count = count
        self.func = func
        self.flag = flag

        self.explore = 3000000
        self.observer = 10000

        self.image_shape = (80, 80)
        self.pre_step_epoch = 10000
        super().__init__()

    def show(self):
        for _ in self.next_data():
            yield _

    def next_data(self):
        # parameter settings
        epsilon = 0.001 if self.flag in ["explore", "display"] else 0.1
        init_epsilon, final_epsilon = 0.1, 0.001
        action_dim = 2
        # initialization
        num = 40 if self.flag in ["explore", "train"] else 1
        game = Game(num)    # the game is the environment

        action = np.array([1, 0])
        image, reward, terminal = game.frame_step(action)

        image = convert(image)
        for _ in range(4):
            self.memory_append(image, [*action, reward, terminal])
        epsilon -= (init_epsilon - final_epsilon) / self.explore * self.count * self.pre_step_epoch
        epsilon = np.clip(epsilon, a_max=init_epsilon, a_min=final_epsilon)

        # resume the global step counter from previous training
        count = self.count * self.pre_step_epoch
        try:
            while True:
                # choose an action (epsilon-greedy)
                if random.random() < epsilon:
                    action_ind = np.random.randint(0, action_dim)
                else:
                    idx = (self.next - np.arange(1, self.time_step+1)) % self.max_length
                    state = self.memory["image"][idx]

                    state = np.transpose(state[np.newaxis, :, :], [0, 2, 3, 1])
                    action_ind = self.func(state).argmax(-1).astype("int")[0]   # the agent picks the action

                epsilon -= (init_epsilon - final_epsilon) / self.explore
                epsilon = np.clip(epsilon, a_max=init_epsilon, a_min=final_epsilon)
                count += 1

                action = game.get_event(action_ind)     # trigger the corresponding event in the game

                image, reward, terminal = game.frame_step(action)   # feedback (reward) from the environment
                image = convert(image)  # 80*80

                self.memory_append(image, [*action, reward, terminal])
                data = self.batch_data()
                if data is not None:
                    yield data

        except GameOver:
            print("\n{}> game close <{}".format("="*10, "="*10))

    def batch_data(self, batch_size=32):
        gamma = 0.99
        num_sample = self.next - self.head
        if num_sample < self.observer:
            sys.stdout.write("\r num of sample is : %d/%d" % (num_sample, self.observer))
            sys.stdout.flush()
            return None
        batch_ind = np.random.choice(np.arange(self.head, self.next), [batch_size]) % self.max_length
        # sample a batch for training
        image_ind = (batch_ind[:, np.newaxis] - np.arange(self.time_step)) % self.max_length
        # the sampled step holds the state produced by the predicted action, so the previous
        # state's record is compared against the prediction target y; e.g. for a sampled index of 4,
        # next_state is built from frames 1-4 and current_state from frames 0-3
        current_state = np.transpose(self.memory["image"][(image_ind - 1) % self.max_length], [0, 2, 3, 1])
        next_state = np.transpose(self.memory["image"][image_ind], [0, 2, 3, 1])
        art = self.memory["art"][batch_ind]
        action, reward, terminal = art[:, 0:2], art[:, -2], art[:, -1]
        out = self.func(next_state).max(-1)
        batch_y = reward + gamma * out * (1 - terminal)
        return [current_state, action], batch_y

Network

import keras
from keras.layers import Dense, Conv2D, MaxPool2D, \
    Input, Flatten, Dot, Activation


def NetV1():
    state_shape, action_dim = [80, 80, 4], 2
    actions = Input([action_dim])
    state = Input(state_shape)

    x = Conv2D(32, kernel_size=8, strides=4, padding="same")(state)
    x = Activation("relu")(x)
    x = MaxPool2D(pool_size=2)(x)

    x = Conv2D(64, kernel_size=4, strides=2, padding="same")(x)
    x = Activation('relu')(x)

    x = Conv2D(64, kernel_size=3, strides=1, padding="same")(x)
    x = Activation('relu')(x)

    x = Flatten()(x)
    x = Dense(512)(x)
    x = Activation('relu')(x)

    out1 = Dense(action_dim)(x)

    out2 = Dot(-1)([actions, out1])
    model = keras.Model([state, actions], out2)
    optimizer = keras.optimizers.Adam(1e-6)
    # optimizer = keras.optimizers.SGD(lr=1e-5, decay=1e-6, momentum=0.9, nesterov=True)
    loss = keras.losses.mse
    model.compile(optimizer=optimizer, loss=loss)
    model.summary()
    return model

Training

from agent import GameMemory
import keras
from net import NetV1, NetV2
import os

EPOCHS = 400
STEPS_PER_EPOCH = 10000
FLAG = "train"


def get_net(net_version):
    train_net, path = (NetV1(), "NETV1/") if net_version == 0 else (NetV2(), "NETV2/")
    os.makedirs(path, exist_ok=True)    # make sure the checkpoint directory exists
    call_function = [
        keras.callbacks.ModelCheckpoint(filepath=path + "weight.{epoch:02d}.h5")]

    if len(os.listdir(path)) == 0:
        return train_net, call_function, 0
    counts = [int(file.split(".")[1]) for file in os.listdir(path)]
    count = max(counts)
    filename = path + "weight.%02d.h5" % count
    train_net.load_weights(filename)
    return train_net, call_function, count


def train():
    net, call_function, count = get_net(net_version=0)
    agent = keras.backend.function(net.input[0], net.layers[-2].output)
    data = GameMemory(agent, count, flag=FLAG)    # flag in ["train", "explore", "display"]; training uses more random actions
    net.fit(data.next_data(), epochs=EPOCHS, initial_epoch=data.count,
            steps_per_epoch=STEPS_PER_EPOCH, callbacks=call_function)


if __name__ == '__main__':
    train()

Model Download

After downloading, extract the archive's contents into the FlappyBird project root directory:

FlappyBird
|-- model
|   |-- bird-dqn-2920000
|   |-- weight.292.h5
|-- NETV1
|   |-- weight.200.h5
How It Works

The train() function in main.py sets up the network and the callback functions.

Variable               Value
network                net
prediction function    func = Model(net.input[0], net.out1)
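
out1 is a local tensor inside NetV1 rather than a model attribute, so main.py builds the prediction function with keras.backend.function(net.input[0], net.layers[-2].output). A minimal sketch of the equivalent sub-model form from the table, assuming layers[-2] is the out1 Dense layer as main.py does:

import keras
from net import NetV1

net = NetV1()
# sub-model from the image input to the raw Q-values (out1), bypassing the action input and the dot product
func = keras.Model(net.input[0], net.layers[-2].output)
# q_values = func.predict(state_batch)   # shape: (batch, 2)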

Reinforcement learning involves five elements: the environment, the agent, the state, the reward function, and actions.

The agent produces an action, and the environment responds with the state that follows that action together with a reward. Note that this state is only a single frame returned by the environment; the training data is built from game frames, with four consecutive frames forming one game state, giving current_state and next_state.
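
For illustration, a minimal sketch of how four consecutive frames are stacked into one (80, 80, 4) state, with next_state shifted by a single frame; the zero arrays stand in for real converted game frames:

import numpy as np

frames = [np.zeros((80, 80), dtype=np.float32) for _ in range(5)]  # placeholders for converted frames

current_state = np.stack(frames[0:4], axis=-1)   # shape (80, 80, 4)
next_state = np.stack(frames[1:5], axis=-1)      # shape (80, 80, 4), one frame later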

The environment uses its reward function to score the game state, but the environment's reward only ever takes three values: game over (-1), success (1), and still flying (0.1).
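
As a hypothetical stand-alone helper (the real logic lives inside play.py), the reward scheme boils down to:

def reward_of(passed_pipe, crashed):
    # environment reward: -1 on a crash, 1 when a pipe is passed, 0.1 while simply staying alive
    if crashed:
        return -1.0
    return 1.0 if passed_pipe else 0.1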

The goal of the reinforcement learning setup is to train the agent. Each training sample has the following format (see the sketch after this list):

  1. the current game state (current_state),

  2. the action taken in that state (action),

  3. the ideal value of the next game state (next_state), i.e. the Q_value.
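
Put together, one training batch looks like the following sketch; the shapes match batch_data above and the arrays are placeholders:

import numpy as np

batch_size = 32
current_state = np.zeros((batch_size, 80, 80, 4))   # 1. current game states
action = np.zeros((batch_size, 2))                  # 2. one-hot actions that were taken
batch_y = np.zeros((batch_size,))                   # 3. targets computed from the next states (Q_value)
# the model is then fitted on ([current_state, action], batch_y)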

Training the agent is essentially fitting the agent's reward function. In FlappyBird, the bird only earns a reward of 1 when it flies past the center of the pipe gap; staying close to the center line and flying outside the pipes both yield 0.1. From the environment's point of view this is perfectly normal, as shown in the figure below.

To a human, however, the two situations are not equally valuable: in the state close to the center line (state 2 in the figure below), people instinctively feel the bird is doing better and deserves a higher reward. A freshly initialized network cannot reproduce this kind of reward function, so training the agent means fitting a reward function that comes close to the human intuition.

Q_value comes from the network's out1:

Q_value = max(func.predict(next_state))

The reward the agent believes the bird truly earns in that game state is then:

agent_reward = reward + gamma * Q_value * (1 - terminal)

where gamma = 0.99 is the discount factor used in batch_data.

terminal indicates whether the bird crashed; a value of 1 means it crashed.
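
A worked sketch of the target computation, matching batch_data above; predict_q and the zero arrays are stand-ins for the real prediction function and data:

import numpy as np

gamma = 0.99                                # discount factor, as in batch_data
batch = 32
next_state = np.zeros((batch, 80, 80, 4))   # placeholder batch of next states
reward = np.full((batch,), 0.1)             # environment rewards
terminal = np.zeros((batch,))               # 1 where the bird crashed, else 0

def predict_q(states):                      # stand-in for func, the agent's Q-value prediction
    return np.zeros((len(states), 2))

q_value = predict_q(next_state).max(axis=-1)               # best value reachable from the next state
agent_reward = reward + gamma * q_value * (1 - terminal)   # the future term vanishes when terminal == 1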

Given the game's current_state and the action, the agent uses its own fitted reward function to produce a value, current_reward, which is the network's out2:

current_reward = net.predict([current_state, action])

The training objective is to minimize the difference between current_reward and agent_reward; the smaller the gap, the better.
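
Because out2 is the dot product of the one-hot action with out1, current_reward is exactly the Q-value the network currently assigns to the chosen action, and the compiled MSE loss pulls it toward agent_reward. A minimal sketch of a single update step with placeholder data:

import numpy as np
from net import NetV1

net = NetV1()
batch = 32
current_state = np.zeros((batch, 80, 80, 4))
action = np.tile([1.0, 0.0], (batch, 1))    # one-hot actions that were actually taken
agent_reward = np.full((batch,), 0.1)       # targets from the previous sketch

current_reward = net.predict([current_state, action])       # out2: Q-value of the chosen action
net.train_on_batch([current_state, action], agent_reward)   # one MSE step toward agent_reward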

To speed up data generation, the game's frame rate is raised at the start: the normal FPS is 30, but during training it is set to 1000, because the network trains once per game frame and a slow frame rate would leave training waiting for frames. With this setting, each epoch of 10,000 records takes about 2 minutes 50 seconds, so 200 epochs take roughly 9.5 to 10 hours.

The result after 292 training epochs can be seen in the linked demo.

File Structure

net.py contains two network definitions, V1 and V2; currently only V1 is trained.

File                 Description
main.py              training script
agent.py             data generation
play.py              game runtime package
game\control.py      game scheduling
game\element.py      game configuration
game\engine.py       game backend, with two choices: PyQt5 and pygame
game\display.py      game engine written with PyQt5

Network structure (net.py)

   Layer (type)             Input Shape           Kernel           Stride   Output Shape
0  input_state                                                              (None, 80, 80, 4)
1  input_action                                                             (None, 2)
2  conv                     (None, 80, 80, 4)     (8, 8, 4, 32)    4        (None, 20, 20, 32)
3  pool                     (None, 20, 20, 32)                     2        (None, 10, 10, 32)
4  conv                     (None, 10, 10, 32)    (4, 4, 32, 64)   2        (None, 5, 5, 64)
5  conv                     (None, 5, 5, 64)      (3, 3, 64, 64)   1        (None, 5, 5, 64)
6  flatten                  (None, 5, 5, 64)                                (None, 1600)
7  fully_connected          (None, 1600)          (1600, 512)               (None, 512)
8  out1 (fully_connected)   (None, 512)           (512, 2)                  (None, 2)
9  out2 (dot)               [action, out1]                                  (None, 1)