第七章的乒乓球~~
import time
import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *
def prepro(I):
    """Preprocess a raw Atari Pong frame into a flat binary vector.

    Crops the 210x160x3 uint8 frame, downsamples it by a factor of 2 to
    80x80, erases the two background colors (pixel values 144 and 109),
    and binarizes everything else to 1.

    NOTE(review): the in-place assignments below write through NumPy views
    and therefore mutate the caller's array; gym returns a fresh frame each
    step so this is harmless here.

    :param I: raw observation array from the Pong environment
    :return: 1-D float (float64) array of length 6400 (80*80)
    """
    I = I[35:195]       # crop rows — presumably keeps the playfield only
    I = I[::2, ::2, 0]  # downsample by 2 and keep a single color channel
    I[I == 144] = 0     # erase background color #1
    I[I == 109] = 0     # erase background color #2
    I[I != 0] = 1       # everything remaining (paddles, ball) becomes 1
    # np.float was removed in NumPy 1.24; the builtin float is the same
    # float64 dtype the original alias resolved to.
    return I.astype(float).ravel()
# --- Policy network and training graph (TensorFlow 1.x + TensorLayer) ---
image_size = 80                  # prepro() outputs 80x80 frames
D = image_size * image_size      # flattened input dimension: 6400

# Placeholder for a batch of preprocessed difference frames.
t_states = tf.placeholder(tf.float32, shape=[None, D])

# Two-layer policy network: 6400 -> 200 (ReLU) -> 3 logits.
network = InputLayer(t_states, name='input')
network = DenseLayer(network, n_units=200, act=tf.nn.relu, name='hidden')
network = DenseLayer(network, n_units=3, name='output')
probs = network.outputs                 # raw logits
sampling_prob = tf.nn.softmax(probs)    # action distribution for sampling

# Hyper-parameters.
batch_size = 10          # episodes per parameter update
learning_rate = 1e-4
gamma = 0.99             # reward discount factor
decay_rate = 0.99        # RMSProp decay
render = False           # set True to visualize the environment
# resume = True
model_file_name = "model_pong72"

# Taken actions (0-based class index) and the per-step normalized
# discounted rewards used to weight the policy-gradient loss.
t_actions = tf.placeholder(tf.int32, shape=[None])
t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
# REINFORCE loss: cross-entropy of taken actions weighted by rewards.
loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)

# np.set_printoptions(threshold=np.nan)
env = gym.make("Pong-v0")
observation = env.reset()

# Episode bookkeeping.
prev_x = None            # previous preprocessed frame (None at episode start)
running_reward = None    # exponential moving average of episode reward
reward_sum = 0           # total reward of the current episode
episode_number = 0
xs, ys, rs = [], [], []  # trajectory buffers: states, actions, rewards
start_time = time.time()
game_number = 0          # rally counter within the current episode

sess = tf.InteractiveSession()
tl.layers.initialize_global_variables(sess)
# if resume:
#     load_params = tl.files.load_npz(name=model_file_name+'.npz')
#     tl.files.assign_params(sess, load_params, network)
# Best-effort restore from checkpoint; TensorLayer logs a warning and
# continues with fresh weights when the .npz file is absent.
tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)
network.print_params()
network.print_layers()
# --- Main training loop: one iteration per environment step ---
# (Indentation restored — the pasted source had been flattened.)
while True:
    if render:
        env.render()

    # Policy input is the difference between two consecutive preprocessed
    # frames, which captures motion; a zero vector on the first step.
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    x = x.reshape(1, D)
    prev_x = cur_x

    # Sample an action from the current policy distribution.
    # NOTE(review): actions 1/2/3 presumably map to NOOP/UP/DOWN in
    # Pong-v0 — confirm against the env's action meanings.
    prob = sess.run(sampling_prob, feed_dict={t_states: x})
    action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

    observation, reward, done, _ = env.step(action)
    reward_sum += reward

    # Record the trajectory for the policy-gradient update.
    xs.append(x)
    ys.append(action - 1)  # shift to a 0-based class index for the loss
    rs.append(reward)

    if done:
        episode_number += 1
        game_number = 0

        # Update parameters once every `batch_size` episodes.
        if episode_number % batch_size == 0:
            print('batch over...... updating parameters......')
            epx = np.vstack(xs)
            epy = np.asarray(ys)
            epr = np.asarray(rs)
            # Discounted rewards, normalized to zero mean / unit variance
            # to reduce gradient variance.
            disR = tl.rein.discount_episode_rewards(epr, gamma)
            disR -= np.mean(disR)
            disR /= np.std(disR)
            xs, ys, rs = [], [], []
            sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR})

        # Checkpoint the model every batch_size * 100 episodes.
        if episode_number % (batch_size * 100) == 0:
            tl.files.save_npz(network.all_params, name=model_file_name + '.npz')

        # Exponential moving average of the per-episode reward.
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
        reward_sum = 0
        observation = env.reset()
        prev_x = None

    # In Pong a nonzero reward marks the end of one rally ("game");
    # +1 (win) gets the exclamation marks, -1 (loss) does not.
    if reward != 0:
        print(('episode %d: game %d took %.5fs, reward: %f' %
               (episode_number, game_number, time.time() - start_time, reward)
               ), ('' if reward == -1 else ' !!!!!!!!'))
        start_time = time.time()
        game_number += 1
如果直接用 pip install gym[atari] 安裝,會出現報錯,報錯如下:
錯誤的意思是無法 make files。據說在 Mac 和 Linux 上可以正常安裝,但我用的是 Windows;後來查到了解決方法,改用 pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py
就可以了,如下:
運行輸出如下。教程上說要訓練約 20000 輪之後才會開始看到效果。
[TL] InputLayer input: (?, 6400)
[TL] DenseLayer hidden: 200 relu
[TL] DenseLayer output: 3 identity
[TL] [!] Load model_pong72.npz failed!
[TL] param 0: hidden/W:0 (6400, 200) float32_ref (mean: -4.545314368442632e-06, median: -0.00012795478687621653, std: 0.08794427663087845)
[TL] param 1: hidden/b:0 (200,) float32_ref (mean: 0.0 , median: 0.0 , std: 0.0 )
[TL] param 2: output/W:0 (200, 3) float32_ref (mean: -0.0008740616030991077, median: -0.002246866002678871, std: 0.08691024035215378)
[TL] param 3: output/b:0 (3,) float32_ref (mean: 0.0 , median: 0.0 , std: 0.0 )
[TL] num of params: 1280803
[TL] layer 0: hidden/Relu:0 (?, 200) float32
[TL] layer 1: output/Identity:0 (?, 3) float32
episode 0: game 0 took 0.35085s, reward: -1.000000
episode 0: game 1 took 0.07739s, reward: -1.000000
episode 0: game 2 took 0.08256s, reward: -1.000000
episode 0: game 3 took 0.06930s, reward: -1.000000
episode 0: game 4 took 0.09142s, reward: -1.000000
episode 0: game 5 took 0.08449s, reward: -1.000000
episode 0: game 6 took 0.07453s, reward: -1.000000
episode 0: game 7 took 0.07654s, reward: -1.000000
episode 0: game 8 took 0.07322s, reward: -1.000000
episode 0: game 9 took 0.08227s, reward: -1.000000
episode 0: game 10 took 0.06737s, reward: -1.000000
episode 0: game 11 took 0.08508s, reward: -1.000000
episode 0: game 12 took 0.06526s, reward: -1.000000
episode 0: game 13 took 0.08321s, reward: -1.000000
episode 0: game 14 took 0.06588s, reward: -1.000000
episode 0: game 15 took 0.09627s, reward: -1.000000
episode 0: game 16 took 0.07620s, reward: -1.000000
episode 0: game 17 took 0.07818s, reward: -1.000000
episode 0: game 18 took 0.07424s, reward: -1.000000
episode 0: game 19 took 0.07847s, reward: -1.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
~~~~~~~~~
~~~~~~~~
episode 169: game 21 took 0.10227s, reward: -1.000000
batch over...... updating parameters......
2018-08-22 14:11:13.661716: W tensorflow/core/framework/allocator.cc:101] Allocation of 319283200 exceeds 10% of system memory.
resetting env. episode reward total was -20.000000. running mean: -20.599237
~~~~~~~~~~~
~~~~~~~~~