Here we use the gym environment for the simulation. From the CartPole (cart with inverted pendulum) environment model it is easy to see that the state space has four dimensions and the action space has two discrete actions: action 1 applies a positive force of +10 N to the cart, and action 0 applies a negative force of -10 N.
The code uses env.step() to advance the simulation one step at a time; in gym, env.step() returns four values: (observation, reward, done, info).
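For readers unfamiliar with this interface, the following standalone sketch (not part of the full program below) drives the cart with random actions and prints what env.step() returns, using the classic gym API that returns the four values described above:
import gym

env = gym.make('CartPole-v0')
observation = env.reset()                  # initial 4-dimensional state
done = False
while not done:
    action = env.action_space.sample()     # random action: 0 or 1
    observation, reward, done, info = env.step(action)
    print(observation, reward, done, info)
env.close()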
The full code is as follows:
# -*- coding: utf-8 -*-
import os
import random
import numpy as np
import gym
from collections import deque
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
import pandas as pd
# from DRL import DRL
import matplotlib.pyplot as plt
class DQN():
    """Deep Q-Learning.
    """
    def __init__(self):
        super(DQN, self).__init__()

        self.model = self.build_model()
        self.env = gym.make('CartPole-v0')

        if not os.path.exists('model'):
            os.mkdir('model')
        if not os.path.exists('history'):
            os.mkdir('history')

        # experience replay buffer.
        self.memory_buffer = deque(maxlen=2000)
        # discount rate for future Q values.
        self.gamma = 0.95
        # epsilon of ε-greedy.
        self.epsilon = 1.0
        # decay rate for epsilon.
        self.epsilon_decay = 0.995
        # minimum epsilon of ε-greedy.
        self.epsilon_min = 0.01
    def load(self):
        if os.path.exists('model/dqn.h5'):
            self.model.load_weights('model/dqn.h5')
    def build_model(self):
        """basic model.
        """
        inputs = Input(shape=(4,))
        x = Dense(16, activation='relu')(inputs)
        x = Dense(16, activation='relu')(x)
        x = Dense(2, activation='linear')(x)

        model = Model(inputs=inputs, outputs=x)
        model.compile(loss='mse', optimizer=Adam(1e-3))

        return model
    def save_history(self, history, name):
        name = os.path.join('history', name)
        df = pd.DataFrame.from_dict(history)
        df.to_csv(name, index=False, encoding='utf-8')
    def play(self, m='pg'):
        """play game with model.

        The 'pg' and 'acs' branches are leftovers for policy-gradient and
        actor-critic agents that share this interface; for DQN, pass any
        other value (e.g. 'dqn') so that the greedy argmax branch is used.
        """
        print('play...')
        observation = self.env.reset()

        reward_sum = 0
        random_episodes = 0

        while random_episodes < 10:
            self.env.render()

            x = observation.reshape(-1, 4)
            if m == 'pg':
                prob = self.model.predict(x)[0][0]
                action = 1 if prob > 0.5 else 0
            elif m == 'acs':
                prob = self.actor.predict(x)[0][0]
                action = 1 if prob > 0.5 else 0
            else:
                action = np.argmax(self.model.predict(x)[0])

            observation, reward, done, _ = self.env.step(action)
            reward_sum += reward

            if done:
                print("Reward for this episode was: {}".format(reward_sum))
                random_episodes += 1
                reward_sum = 0
                observation = self.env.reset()

        self.env.close()
    def egreedy_action(self, state):
        """ε-greedy action selection.
        Arguments:
            state: observation
        Returns:
            action: action
        """
        if np.random.rand() <= self.epsilon:
            return random.randint(0, 1)
        else:
            q_values = self.model.predict(state)[0]
            return np.argmax(q_values)
    def remember(self, state, action, reward, next_state, done):
        """add data to experience replay.
        Arguments:
            state: observation
            action: action
            reward: reward
            next_state: next observation
            done: whether the episode is finished.
        """
        item = (state, action, reward, next_state, done)
        self.memory_buffer.append(item)
    def update_epsilon(self):
        """update epsilon
        """
        if self.epsilon >= self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    def process_batch(self, batch):
        """process batch data.
        Arguments:
            batch: batch size
        Returns:
            X: states
            y: [Q_value1, Q_value2]
        """
        # randomly sample a batch of transitions from the experience replay buffer.
        data = random.sample(self.memory_buffer, batch)

        # compute the Q-learning targets.
        states = np.array([d[0] for d in data])
        next_states = np.array([d[3] for d in data])

        y = self.model.predict(states)
        q = self.model.predict(next_states)

        for i, (_, action, reward, _, done) in enumerate(data):
            target = reward
            if not done:
                target += self.gamma * np.amax(q[i])
            y[i][action] = target

        return states, y
    def train(self, episode, batch):
        """training
        Arguments:
            episode: number of game episodes
            batch: batch size
        Returns:
            history: training history
        """
        history = {'episode': [], 'Episode_reward': [], 'Loss': []}
        episode_all = []
        episode_reward_all = []
        loss_all = []

        count = 0
        for i in range(episode):
            observation = self.env.reset()
            reward_sum = 0
            loss = np.infty
            done = False

            while not done:
                self.env.render()
                # choose an action with ε-greedy.
                x = observation.reshape(-1, 4)
                action = self.egreedy_action(x)
                observation, reward, done, _ = self.env.step(action)
                # add the transition to experience replay.
                reward_sum += reward
                self.remember(x[0], action, reward, observation, done)

                if len(self.memory_buffer) > batch:
                    X, y = self.process_batch(batch)
                    loss = self.model.train_on_batch(X, y)

                    count += 1
                    # decay epsilon once per training step.
                    self.update_epsilon()

            if i % 5 == 0:
                history['episode'].append(i)
                history['Episode_reward'].append(reward_sum)
                history['Loss'].append(loss)
                print('Episode: {} | Episode reward: {} | loss: {:.3f} | e:{:.2f}'.format(i, reward_sum, loss, self.epsilon))

            episode_all.append(i)
            episode_reward_all.append(reward_sum)
            loss_all.append(loss)

        self.model.save_weights('model/dqn.h5')

        return history, episode_all, episode_reward_all, loss_all
if __name__ == '__main__':
    model = DQN()

    history, episode_all, episode_reward_all, loss_all = model.train(600, 32)
    model.save_history(history, 'dqn.csv')

    model.load()
    # pass 'dqn' so that play() uses the greedy argmax branch,
    # not the default 'pg' branch meant for policy-gradient agents.
    model.play('dqn')

    plt.figure(1)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.title('reward')
    plt.plot(episode_all, episode_reward_all)
    plt.show()

    plt.figure(2)
    plt.xlabel('episode')
    plt.ylabel('Loss')
    plt.title('Loss')
    plt.plot(episode_all, loss_all)
    plt.show()
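To make the target computation in process_batch concrete, here is a toy trace with made-up numbers for a single sampled transition (assuming gamma = 0.95, as in the class above):
import numpy as np

gamma = 0.95
y_i = np.array([0.8, 1.2])   # current Q estimates for state s: [Q(s,0), Q(s,1)]
q_i = np.array([0.5, 2.0])   # Q estimates for the next state s'
action, reward, done = 1, 1.0, False

target = reward
if not done:
    target += gamma * np.amax(q_i)   # 1.0 + 0.95 * 2.0 = 2.9
y_i[action] = target                 # y_i becomes [0.8, 2.9]
Only the Q value of the action actually taken is overwritten, so the other output contributes nothing to the MSE loss for this sample.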
The theory behind DQN is not covered here, as plenty of material is available online; the focus is on the code. Its structure is clear, and four steps are enough to build and use the DQN (see the evaluation sketch after this list):
1. model = DQN() initializes the network structure and the environment.
2. history, episode_all, episode_reward_all, loss_all = model.train(600, 32) trains the network.
3. model.load() loads the saved model weights.
4. model.play('dqn') controls the inverted pendulum directly with the saved parameters.
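If a checkpoint already exists in model/dqn.h5, steps 3 and 4 can also be run on their own, skipping the lengthy training (a minimal sketch using the class above):
model = DQN()
model.load()         # restores weights from model/dqn.h5 if the file exists
model.play('dqn')    # greedy control of the pendulum with the loaded Q network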