Playing the Game with a Policy Network
import time

import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import DenseLayer, InputLayer

# hyperparameters
image_size = 80
D = image_size * image_size
H = 200
batch_size = 10
learning_rate = 1e-4
gamma = 0.99
decay_rate = 0.99
render = False # display the game environment
# resume = True # load existing policy network
model_file_name = "model_pong"
np.set_printoptions(threshold=np.nan)
def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    I = I[35:195]       # crop the playfield
    I = I[::2, ::2, 0]  # downsample by a factor of 2 and keep one channel
    I[I == 144] = 0     # erase background (type 1)
    I[I == 109] = 0     # erase background (type 2)
    I[I != 0] = 1       # everything else (paddles, ball) is set to 1
    return I.astype(np.float).ravel()
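
A quick sanity check (a hypothetical snippet, not part of the training script) confirms that prepro maps a raw 210x160x3 frame to the 6400-dimensional vector that D expects:

frame = np.zeros((210, 160, 3), dtype=np.uint8)  # stand-in for a raw Pong frame
assert prepro(frame).shape == (D,)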
Using a third-party library such as TensorLayer, we define a policy network. Its input is a state, defined as the difference between two consecutive game frames; its output is the probability of each of three actions: up, down, and stay. The cross-entropy reward loss is a cross-entropy loss whose predictions are the probabilities currently output by the network and whose labels are the actions sampled from each state in proportion to those probabilities; each term is weighted by the discounted reward. The parameters are then updated with sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR}), as shown at the end of the listing.
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None
running_reward = None
reward_sum = 0
episode_number = 0

xs, ys, rs = [], [], []
# observation for training and inference
t_states = tf.placeholder(tf.float32, shape=[None, D])
# policy network
network = InputLayer(t_states, name='input')
network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='hidden')
network = DenseLayer(network, n_units=3, name='output')
probs = network.outputs
sampling_prob = tf.nn.softmax(probs)
t_actions = tf.placeholder(tf.int32, shape=[None])
t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
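
For reference, the reward-weighted loss above combines the standard softmax cross-entropy with the discounted rewards. A minimal raw-TensorFlow sketch of the same idea (an assumption about what tl.rein.cross_entropy_reward_loss computes, not its exact code):

ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=probs, labels=t_actions)
loss_sketch = tf.reduce_sum(ce * t_discount_rewards)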
with tf.Session() as sess:
    tl.layers.initialize_global_variables(sess)
    tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)
    network.print_params()
    network.print_layers()

    start_time = time.time()
    game_number = 0
    while True:
        if render: env.render()

        cur_x = prepro(observation)
        # the state is the difference between the current and previous frame
        x = cur_x - prev_x if prev_x is not None else np.zeros(D)
        x = x.reshape(1, D)
        prev_x = cur_x

        prob = sess.run(sampling_prob, feed_dict={t_states: x})
        # action. 1: STOP 2: UP 3: DOWN
        # action = np.random.choice([1, 2, 3], p=prob.flatten())
        action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

        observation, reward, done, _ = env.step(action)
        reward_sum += reward
        xs.append(x)           # all observations in an episode
        ys.append(action - 1)  # all fake labels in an episode (action begins from 1, so minus 1)
        rs.append(reward)      # all rewards in an episode
        if done:
            episode_number += 1
            game_number = 0

            if episode_number % batch_size == 0:
                print('batch over...... updating parameters......')
                epx = np.vstack(xs)
                epy = np.asarray(ys)
                epr = np.asarray(rs)
                disR = tl.rein.discount_episode_rewards(epr, gamma)
                # standardize the discounted rewards before the gradient step
                disR -= np.mean(disR)
                disR /= np.std(disR)

                xs, ys, rs = [], [], []

                sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR})
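            # The load_and_assign_npz call above expects a saved model, but the
            # listing never writes one. A plausible addition (assuming
            # tl.files.save_npz, the counterpart of the load) is a periodic save:
            if episode_number % (batch_size * 100) == 0:
                tl.files.save_npz(network.all_params, name=model_file_name + '.npz')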
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
            reward_sum = 0
            observation = env.reset()  # reset env
            prev_x = None
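
tl.rein.discount_episode_rewards is what turns the per-step rewards into the discounted returns used above. A NumPy sketch of the Pong-style computation (an assumption about its behavior: a non-zero reward means a point was scored, so the running sum is reset at that step):

def discount_sketch(r, gamma):
    # accumulate gamma-discounted returns backwards through the episode
    out = np.zeros_like(r, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(r))):
        if r[t] != 0:
            running = 0.0  # game boundary: a point was scored
        running = running * gamma + r[t]
        out[t] = running
    return out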