TensorLayer learning diary 17: chapter 7, section 7.2

The Pong game from Chapter 7~~

import time
import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *

def prepro(I):
    """Preprocess a 210x160x3 uint8 Pong frame into a 6400-dim (80x80) float vector."""
    I = I[35:195]       # crop to the playing field
    I = I[::2, ::2, 0]  # downsample by a factor of 2 and keep one colour channel
    I[I == 144] = 0     # erase background (background type 1)
    I[I == 109] = 0     # erase background (background type 2)
    I[I != 0] = 1       # everything else (paddles, ball) is set to 1
    return I.astype(np.float).ravel()
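
# A quick sanity check (my own addition, based on the usual Pong-v0 frame shape, not from the tutorial):
# a raw observation is a (210, 160, 3) uint8 array, so after cropping, downsampling by 2 and
# flattening, prepro should return exactly 80 * 80 = 6400 floats, e.g.
#   assert prepro(np.zeros((210, 160, 3), dtype=np.uint8)).shape == (6400,)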

image_size = 80
D = image_size * image_size
t_states = tf.placeholder(tf.float32, shape=[None, D])
network = InputLayer(t_states, name='input')
network = DenseLayer(network, n_units=200, act=tf.nn.relu, name='hidden')
network = DenseLayer(network, n_units=3, name='output')
probs = network.outputs
sampling_prob = tf.nn.softmax(probs)
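# The output layer has 3 logits because only three Pong actions are used here
# (my reading of the action set: 1 = FIRE/no-op, 2 = UP, 3 = DOWN); sampling_prob turns
# the logits into a categorical distribution that the agent samples an action from.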

batch_size = 10
learning_rate = 1e-4
gamma = 0.99
decay_rate = 0.99
render = False      # set to True to watch the game window while training
# resume = True
model_file_name = "model_pong72"

t_actions = tf.placeholder(tf.int32, shape=[None])
t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
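# For reference, tl.rein.cross_entropy_reward_loss is essentially the REINFORCE loss:
# the per-step cross entropy between the sampled action and the policy, weighted by the
# discounted return of that step. A rough sketch of the same idea in plain TensorFlow
# (an approximation, not the library's exact source):
#   ce   = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=probs, labels=t_actions)
#   loss = tf.reduce_sum(ce * t_discount_rewards)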

# np.set_printoptions(threshold=np.nan)
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None
running_reward = None
reward_sum = 0
episode_number = 0
xs, ys, rs = [], [], []

start_time = time.time()
game_number = 0

sess = tf.InteractiveSession()

tl.layers.initialize_global_variables(sess)
# if resume:
#     load_params = tl.files.load_npz(name=model_file_name+'.npz')
#     tl.files.assign_params(sess, load_params, network)
tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)  # prints a warning if no saved model exists yet
network.print_params()
network.print_layers()


while True:
    if render:
        env.render()

    cur_x = prepro(observation)
    # the policy sees the difference between two consecutive frames, so motion is visible
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    x = x.reshape(1, D)
    prev_x = cur_x

    prob = sess.run(sampling_prob, feed_dict={t_states: x})
    # sample action 1, 2 or 3 from the softmax probabilities
    action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

    observation, reward, done, _ = env.step(action)
    reward_sum += reward
    xs.append(x)            # observations in the episode
    ys.append(action - 1)   # fake labels: action 1/2/3 -> class 0/1/2
    rs.append(reward)       # rewards in the episode

    if done:
        episode_number += 1
        game_number = 0

        if episode_number % batch_size == 0:
            print('batch over...... updating parameters......')
            epx = np.vstack(xs)
            epy = np.asarray(ys)
            epr = np.asarray(rs)
            disR = tl.rein.discount_episode_rewards(epr, gamma)
            disR -= np.mean(disR)
            disR /= np.std(disR)
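            # tl.rein.discount_episode_rewards walks backwards through the rewards with
            # R_t = r_t + gamma * R_{t+1}; in its default Pong mode the running return is
            # reset whenever r_t != 0, i.e. at every point scored (my understanding of the
            # helper). The mean/std normalisation above is the usual variance-reduction
            # trick for REINFORCE.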

            xs, ys, rs = [], [], []

            sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR})

        if episode_number % (batch_size * 100) == 0:
            tl.files.save_npz(network.all_params, name=model_file_name + '.npz')

        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
        reward_sum = 0
        observation = env.reset()  
        prev_x = None

    if reward != 0:
        print(('episode %d: game %d took %.5fs, reward: %f' %
                (episode_number, game_number, time.time() - start_time, reward)
            ), ('' if reward == -1 else ' !!!!!!!!'))
        start_time = time.time()
        game_number += 1

If you simply run pip install gym[atari] on Windows, it fails with a build error: the installer cannot run make to compile atari-py. It reportedly works fine on macOS and Linux, but since I am on Windows I searched around and found a workaround:

pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py

With that, the Atari environments install correctly; a quick sanity check is shown below.
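The following is a minimal sketch (assuming the classic gym API of that era) to confirm that Pong-v0 can be created once atari_py is installed this way:

import gym

env = gym.make("Pong-v0")      # fails here if atari_py did not install properly
print(env.action_space)        # Discrete(6)
print(env.observation_space)   # Box(210, 160, 3), uint8 frames
env.close()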

The training output looks like this. The tutorial says you need to train for about 20,000 episodes before the agent starts to show visible progress:

[TL] InputLayer  input: (?, 6400)
[TL] DenseLayer  hidden: 200 relu
[TL] DenseLayer  output: 3 identity
[TL] [!] Load model_pong72.npz failed!
[TL]   param   0: hidden/W:0           (6400, 200)        float32_ref (mean: -4.545314368442632e-06, median: -0.00012795478687621653, std: 0.08794427663087845)   
[TL]   param   1: hidden/b:0           (200,)             float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
[TL]   param   2: output/W:0           (200, 3)           float32_ref (mean: -0.0008740616030991077, median: -0.002246866002678871, std: 0.08691024035215378)   
[TL]   param   3: output/b:0           (3,)               float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
[TL]   num of params: 1280803
[TL]   layer   0: hidden/Relu:0        (?, 200)           float32
[TL]   layer   1: output/Identity:0    (?, 3)             float32
episode 0: game 0 took 0.35085s, reward: -1.000000 
episode 0: game 1 took 0.07739s, reward: -1.000000 
episode 0: game 2 took 0.08256s, reward: -1.000000 
episode 0: game 3 took 0.06930s, reward: -1.000000 
episode 0: game 4 took 0.09142s, reward: -1.000000 
episode 0: game 5 took 0.08449s, reward: -1.000000 
episode 0: game 6 took 0.07453s, reward: -1.000000 
episode 0: game 7 took 0.07654s, reward: -1.000000 
episode 0: game 8 took 0.07322s, reward: -1.000000 
episode 0: game 9 took 0.08227s, reward: -1.000000 
episode 0: game 10 took 0.06737s, reward: -1.000000 
episode 0: game 11 took 0.08508s, reward: -1.000000 
episode 0: game 12 took 0.06526s, reward: -1.000000 
episode 0: game 13 took 0.08321s, reward: -1.000000 
episode 0: game 14 took 0.06588s, reward: -1.000000 
episode 0: game 15 took 0.09627s, reward: -1.000000 
episode 0: game 16 took 0.07620s, reward: -1.000000 
episode 0: game 17 took 0.07818s, reward: -1.000000 
episode 0: game 18 took 0.07424s, reward: -1.000000 
episode 0: game 19 took 0.07847s, reward: -1.000000 
resetting env. episode reward total was -21.000000. running mean: -21.000000
~~~~~~~~~
~~~~~~~~
episode 169: game 21 took 0.10227s, reward: -1.000000 
batch over...... updating parameters......
2018-08-22 14:11:13.661716: W tensorflow/core/framework/allocator.cc:101] Allocation of 319283200 exceeds 10% of system memory.
resetting env. episode reward total was -20.000000. running mean: -20.599237

~~~~~~~~~~~
~~~~~~~~~
