TD3代码实现
文章目录
代码及解释
0.运行环境
设备/包 | 版本 |
---|---|
python | 3.7.11 |
显卡 | GTX 1050 |
CUDA | 10.2 |
cudnn | 7.6.5 |
cudatoolkit | 10.0.130 |
tensorflow-gpu | 2.2.0 |
tensorlayer | 2.2.3 |
tensorflow-probability | 0.9.0 |
1.包引入与参数设定
import argparse
import os
import random
import time
import gym
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import tensorlayer as tl
from tensorlayer.layers import Dense
from tensorlayer.models import Model
from matplotlib import animation
import matplotlib.pyplot as plt
Normal = tfp.distributions.Normal
def _str2bool(value):
    """Parse a command-line boolean: accepts 'true'/'false', '1'/'0', 'yes'/'no' (case-insensitive)."""
    if isinstance(value, bool):
        return value
    return value.lower() in ('true', '1', 'yes', 'y', 't')


parser = argparse.ArgumentParser()
# NOTE: untyped / `type=bool` flags are a classic argparse pitfall — bool('False')
# is True, so `--train False` would still enable training. _str2bool parses the
# string explicitly while keeping the same command-line interface.
parser.add_argument('--train', dest='train', type=_str2bool, default=False)
parser.add_argument('--render', dest='render', type=_str2bool, default=False)
parser.add_argument('--save_gif', type=_str2bool, default=False)
parser.add_argument('--train_episodes', type=int, default=2000)
parser.add_argument('--test_episodes', type=int, default=10)
# during training, an episode runs for at most max_steps steps
parser.add_argument('--max_steps', type=int, default=200)
# for the first explore_steps steps, actions are drawn by random sampling
parser.add_argument('--explore_steps', type=int, default=500)
parser.add_argument('--batch_size', type=int, default=64)
# was `default=5e5` (a float — type= only converts CLI-provided values);
# same size, but as an int so the buffer-capacity arithmetic stays integral
parser.add_argument('--replay_buffer_size', type=int, default=500000)
# hidden layer dimension
parser.add_argument('--hidden_dim', type=int, default=64)
# number of parameter updates performed per environment step
parser.add_argument('--update_itr', type=int, default=3)
# delayed update interval of the policy network (TD3 trick)
parser.add_argument('--delayed_update_itr', type=int, default=3)
parser.add_argument('--q_lr', type=float, default=3e-4)
parser.add_argument('--policy_lr', type=float, default=3e-4)
parser.add_argument('--gamma', type=float, default=0.95)
# soft-update coefficient tau, used when updating the target networks
parser.add_argument('--tau', type=float, default=0.01)
# scale of the exploration noise added when interacting with the environment
parser.add_argument('--explore_noise_scale', type=float, default=1.0)
# scale of the target-policy-smoothing noise used when computing gradients
parser.add_argument('--eval_noise_scale', type=float, default=0.5)
parser.add_argument('--reward_scale', type=float, default=1.0)
# parse_known_args instead of parse_args: importing this module from a
# notebook or test runner with extra argv entries no longer aborts the process
args, _ = parser.parse_known_args()

ALG_NAME = 'TD3'
ENV_ID = 'Pendulum-v0'  # environment id
RANDOM_SEED = 2  # random seed
# (removed a duplicate `Normal = tfp.distributions.Normal`; it is already
# defined right after the imports above)
定义了一个正态分布,通过normal = Normal(0,1)
可以生成特定的正态分布模型。
2.class ReplayBuffer
- 具体实现与DQN(代码及解释在末尾)相同。
class ReplayBuffer:
    """Fixed-size cyclic buffer of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity=10000):
        # int() guards against float capacities such as 5e5 coming from argparse defaults
        self.capacity = int(capacity)
        self.buffer = []
        # index of the slot the next transition will be written to
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest once the buffer is full."""
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)  # grow the list until capacity is reached
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size=64):
        """Uniformly sample a batch and return five stacked numpy arrays.

        The default was previously bound to the module-global `args` at class
        creation; a plain literal (same value) removes that hidden coupling.
        Callers in this file always pass batch_size explicitly.
        """
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
3.class QNetwork(Model)
-
QNetwork用于近似动作价值函数Q(s,a)(action-value function)
-
QNetwork需要实现两个方法。
- `__init__`:神经网络初始化。
- forward:forward函数的任务需要把输入层、网络层、输出层链接起来,实现信息的前向传导。forward方法是必须要重写的,它是实现模型的功能,实现各个层之间的连接关系的核心。
3.1. `__init__`
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
    """Build a 3-layer MLP mapping a concatenated (state, action) vector to a scalar Q value.

    Args:
        state_dim: dimensionality of the state vector.
        action_dim: dimensionality of the action vector.
        hidden_dim: width of the two hidden layers.
        init_w: half-width of the uniform weight-initialisation range.
    """
    super(QNetwork, self).__init__()
    # the network consumes [state, action] concatenated along the feature axis
    concat_dim = state_dim + action_dim
    uniform_init = tf.random_uniform_initializer(-init_w, init_w)
    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=uniform_init, in_channels=concat_dim, name='q1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=uniform_init, in_channels=hidden_dim, name='q2')
    # final layer is linear (no activation): Q values are unbounded
    self.linear3 = Dense(n_units=1, W_init=uniform_init, in_channels=hidden_dim, name='q3')
3.2.forward
def forward(self, inputs):
    """Forward pass of the Q network.

    Args:
        inputs: batch of concatenated [state, action] vectors; the Dense
            in_channels set in __init__ implies shape (batch, state_dim +
            action_dim) — confirm against callers.

    Returns:
        Tensor of shape (batch, 1) holding Q(s, a).
    """
    # renamed the parameter from `input`, which shadowed the Python builtin;
    # the network is always invoked positionally, so callers are unaffected
    hidden = self.linear1(inputs)
    hidden = self.linear2(hidden)
    return self.linear3(hidden)
4.class PolicyNetwork(Model)
-
PolicyNetwork用于近似确定性策略函数 a=π(s)(确定性策略直接输出动作,而非动作的概率分布)。
-
PolicyNetwork类需要实现5个方法。
-
`__init__`:神经网络初始化。
-
forward:向前传播函数。
-
evaluate:在计算梯度时获取动作,这个动作不是与环境交互的真实动作,通常是下一状态对应的动作 $a_{t+1}$。
-
get_action:在与环境交互时获取动作。
-
sample_action:随机选择一个动作。
-
4.1. `__init__`
def __init__(self, state_dim, action_dim, hidden_dim, action_range=1., init_w=3e-3):
    """Deterministic policy: a 3-hidden-layer MLP mapping a state to an action.

    Args:
        state_dim: dimensionality of the state vector.
        action_dim: dimensionality of the action vector.
        hidden_dim: width of each hidden layer.
        action_range: scale applied to the tanh output in forward/evaluate.
        init_w: half-width of the uniform weight-initialisation range.
    """
    super(PolicyNetwork, self).__init__()
    uniform_init = tf.random_uniform_initializer(-init_w, init_w)
    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=uniform_init, in_channels=state_dim, name='policy1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=uniform_init, in_channels=hidden_dim, name='policy2')
    self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=uniform_init, in_channels=hidden_dim, name='policy3')
    # output layer: linear, with uniformly initialised bias; tanh is applied in forward()
    self.output_linear = Dense(
        n_units=action_dim, W_init=uniform_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
        in_channels=hidden_dim, name='policy_output'
    )
    self.action_range = action_range
    self.action_dim = action_dim
4.2.forward
def forward(self, state):
    """Map a batch of states to tanh-squashed actions in the unit range [-1, 1]."""
    hidden = self.linear1(state)
    hidden = self.linear2(hidden)
    hidden = self.linear3(hidden)
    # tanh squashes the raw output into [-1, 1]; callers rescale by action_range
    return tf.nn.tanh(self.output_linear(hidden))
4.3.evaluate
def evaluate(self, state, eval_noise_scale):
    """Compute actions for gradient computation (TD3 target policy smoothing).

    A Gaussian perturbation, clipped to twice its scale, is added to the
    deterministic action so the Q target is smoothed over nearby actions.

    Args:
        state: batch of states (numpy array; cast to float32 here).
        eval_noise_scale: std of the smoothing noise; its clip bound is 2x this.
    """
    state = state.astype(np.float32)
    # deterministic action, rescaled to the environment's range
    action = self.action_range * self.forward(state)
    # sample standard-normal noise, scale it, then clip to [-2*scale, 2*scale]
    noise_dist = Normal(0, 1)
    noise = noise_dist.sample(action.shape) * eval_noise_scale
    clip_bound = 2 * eval_noise_scale
    noise = tf.clip_by_value(noise, -clip_bound, clip_bound)
    return action + noise
- evaluate方法是在更新策略梯度时使用的,我们不希望抽样出过于冒险、与实际太过于不同的行动。因此对噪声做了截断操作,否则会有很小的概率,抽样出很大或者很小的值。
4.4.get_action
def get_action(self, state, explore_noise_scale, greedy=False):
    """Choose an action for interaction with the environment.

    Unless `greedy` is set, unclipped Gaussian exploration noise is added —
    unlike evaluate(), no clipping is applied here, which aids exploration.
    """
    # forward expects a batch, so wrap the single state and unwrap the result
    action = self.action_range * self.forward([state]).numpy()[0]
    if greedy:
        return action
    # exploration noise is deliberately NOT clipped
    noise = Normal(0, 1).sample(action.shape) * explore_noise_scale
    action += noise
    return action.numpy()
- get_action方法是在与环境交互时使用的,不对噪声进行截断,反而增加了agent的探索能力。
4.5.sample_action
def sample_action(self):
    """Draw a uniformly random action scaled to the environment's range (warm-up exploration)."""
    raw = tf.random.uniform([self.action_dim], -1, 1)
    return self.action_range * raw.numpy()
5.class TD3
- TD3算法类需要实现6个方法:
- `__init__`:神经网络初始化。
- target_ini:target网络初始化。
- target_soft_update:对target网络进行软更新。
- update:更新所有网络参数。
- saveModel:保存模型。
- loadModel:加载模型。
5.1. `__init__`
- 需要初始化6个网络:两个不同的价值网络和一个策略网络,以及它们对应的target network。
def __init__(
        self, state_dim, action_dim, action_range, hidden_dim, replay_buffer, policy_target_update_interval=1,
        q_lr=3e-4, policy_lr=3e-4
):
    """Set up the six TD3 networks (twin Q nets + policy, each with a target copy).

    Args:
        state_dim: dimensionality of the observation vector.
        action_dim: dimensionality of the action vector.
        action_range: scale applied to the policy's tanh output.
        hidden_dim: hidden-layer width for all networks.
        replay_buffer: shared ReplayBuffer instance to sample batches from.
        policy_target_update_interval: the policy and the target networks are
            updated only every this many calls to update() (delayed updates).
        q_lr: learning rate for both Q-network optimizers.
        policy_lr: learning rate for the policy optimizer.
    """
    self.replay_buffer = replay_buffer
    # initialize all networks — NOTE(review): construction order presumably
    # consumes the seeded global TF RNG, so it determines the initial weights;
    # keep this order if reproducibility under RANDOM_SEED matters
    self.q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    print('Q Network (1,2): ', self.q_net1)
    print('Policy Network: ', self.policy_net)
    # initialize weights of target networks by hard copy from the online nets
    self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1)
    self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2)
    self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net)
    # set train mode for the online networks, eval mode for the target copies
    self.q_net1.train()
    self.q_net2.train()
    self.target_q_net1.eval()
    self.target_q_net2.eval()
    self.policy_net.train()
    self.target_policy_net.eval()
    # counts update() calls; drives the delayed policy/target updates
    self.update_cnt = 0
    self.policy_target_update_interval = policy_target_update_interval
    self.q_optimizer1 = tf.optimizers.Adam(q_lr)
    self.q_optimizer2 = tf.optimizers.Adam(q_lr)
    self.policy_optimizer = tf.optimizers.Adam(policy_lr)
5.2.target_ini
- 硬更新:直接把神经网络的参数复制给对应的target network
def target_ini(self, net, target_net):
    """Hard copy: overwrite every target parameter with the online network's value.

    Used once at construction time to make the target networks start out
    identical to their online counterparts.
    """
    weight_pairs = zip(target_net.trainable_weights, net.trainable_weights)
    for dst, src in weight_pairs:
        dst.assign(src)
    return target_net
5.3.target_soft_update
- 软更新:θ⁻ ← (1−τ)·θ⁻ + τ·θ(与代码一致:target = (1−τ)·target + τ·param)
def target_soft_update(self, net, target_net, soft_tau):
    """Polyak averaging: target <- (1 - tau) * target + tau * online."""
    weight_pairs = zip(target_net.trainable_weights, net.trainable_weights)
    for dst, src in weight_pairs:
        # blend the online value into the target parameter in place
        blended = dst * (1.0 - soft_tau) + src * soft_tau
        dst.assign(blended)
    return target_net
5.4.update
- update方法主要做以下几件事。
- 从replay buffer取出batch_size个transition。
- 用target policy network求出下一个动作。
- 与DQN with Target相同,都是用target网络进行动作选择,用target网络进行价值评估。
- 标准化reward。
- 在给定transition的情况下,取两个网络较小的值,并求出TD target。
- 使用TD算法更新两个价值网络。
- 使用梯度上升更新策略网络。
- 软更新target network。
def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2):
    """Run one TD3 update on all networks.

    Steps: sample a batch; build smoothed next actions with the target policy;
    normalize rewards; form the clipped-double-Q TD target; take one gradient
    step on each Q network; and — only every policy_target_update_interval
    calls — update the policy and soft-update all three target networks.

    Args:
        batch_size: number of transitions sampled from the replay buffer.
        eval_noise_scale: std of the target-policy-smoothing noise.
        reward_scale: multiplier applied to the batch-normalized rewards.
        gamma: discount factor.
        soft_tau: Polyak coefficient for the target-network soft update.
    """
    self.update_cnt += 1
    state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
    reward = reward[:, np.newaxis]  # expand dim to (batch, 1) to match the Q outputs
    done = done[:, np.newaxis]
    # target policy smoothing: next action = target policy output + clipped noise
    new_next_action = self.target_policy_net.evaluate(
        next_state, eval_noise_scale=eval_noise_scale
    )  # clipped normal noise
    reward = reward_scale * (reward - np.mean(reward, axis=0)) / (
        np.std(reward, axis=0) + 1e-6
    )  # normalize with batch mean and std; plus a small number to prevent numerical problem
    # Training Q Function
    target_q_input = tf.concat([next_state, new_next_action], 1)  # the dim 0 is number of samples
    # clipped double-Q: take the smaller of the two target-Q estimates
    target_q_min = tf.minimum(self.target_q_net1(target_q_input), self.target_q_net2(target_q_input))
    # TD target
    target_q_value = reward + (1 - done) * gamma * target_q_min  # if done==1, only reward
    q_input = tf.concat([state, action], 1)  # input of q_net
    with tf.GradientTape() as q1_tape:
        predicted_q_value1 = self.q_net1(q_input)
        q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value))
    q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights)
    self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights))
    with tf.GradientTape() as q2_tape:
        predicted_q_value2 = self.q_net2(q_input)
        q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value))
    q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights)
    self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights))
    # Training Policy Function — delayed: only every policy_target_update_interval calls
    if self.update_cnt % self.policy_target_update_interval == 0:
        with tf.GradientTape() as p_tape:
            # no noise when updating the actor: we want the actor to seek the
            # maximum of Q, so adding noise here would serve no purpose
            new_action = self.policy_net.evaluate(
                state, eval_noise_scale=0.0
            )  # no noise, deterministic policy gradients
            new_q_input = tf.concat([state, new_action], 1)
            # """ implementation 1 """
            # predicted_new_q_value = tf.minimum(self.q_net1(new_q_input),self.q_net2(new_q_input))
            """ implementation 2 """
            predicted_new_q_value = self.q_net1(new_q_input)
            # gradient ASCENT on Q is gradient descent on -Q
            policy_loss = -tf.reduce_mean(predicted_new_q_value)
        p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights)
        self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights))
        # Soft update the target nets (delayed together with the policy update)
        self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau)
        self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau)
        self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)
5.5.saveModel
def saveModel(self):
    """Save the weights of all six networks as .npz files under model/<ALG>_<ENV>."""
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if not os.path.exists(path):
        os.makedirs(path)
    # table-driven save: same files, same order as before
    named_nets = [
        ('model_q_net1.npz', self.q_net1),
        ('model_q_net2.npz', self.q_net2),
        ('model_target_q_net1.npz', self.target_q_net1),
        ('model_target_q_net2.npz', self.target_q_net2),
        ('model_policy_net.npz', self.policy_net),
        ('model_target_policy_net.npz', self.target_policy_net),
    ]
    for filename, net in named_nets:
        tl.files.save_npz(net.trainable_weights, os.path.join(path, filename))
    print('Saved weights.')
5.6.loadModel
def loadModel(self):
    """Restore the weights of all six networks from model/<ALG>_<ENV>, if present.

    When no saved model directory exists, the networks are left untouched
    and a hint is printed instead.
    """
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if os.path.exists(path):
        # message fixed: this is the TD3 model (not DQN), and 'parametets' was a typo
        print('Load TD3 Network parameters ...')
        extend_path = lambda s: os.path.join(path, s)
        tl.files.load_and_assign_npz(extend_path('model_q_net1.npz'), self.q_net1)
        tl.files.load_and_assign_npz(extend_path('model_q_net2.npz'), self.q_net2)
        tl.files.load_and_assign_npz(extend_path('model_target_q_net1.npz'), self.target_q_net1)
        tl.files.load_and_assign_npz(extend_path('model_target_q_net2.npz'), self.target_q_net2)
        tl.files.load_and_assign_npz(extend_path('model_policy_net.npz'), self.policy_net)
        tl.files.load_and_assign_npz(extend_path('model_target_policy_net.npz'), self.target_policy_net)
        print('Load weights!')
    else:
        # typo fixed: 'find' -> 'found'
        print("No model file found, please train model first...")
6.主程序
6.1.帧画面转化为gif函数
def display_frames_as_gif(frames, path):
    """Save a list of RGB frames (from env.render(mode='rgb_array')) as a GIF at `path`."""
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        # called by FuncAnimation once per frame index; swaps the image data in place
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=5)
    # needs the pillow writer installed; fps controls the gif's playback speed
    anim.save(path, writer='pillow', fps=30)
6.2.main函数
if __name__ == '__main__':
    # initialization of env
    env = gym.make(ENV_ID)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_range = env.action_space.high  # scale action, [-action_range, action_range]
    # reproducible: seed env, stdlib random, numpy and tensorflow
    env.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)
    # initialization of buffer
    replay_buffer = ReplayBuffer(args.replay_buffer_size)
    # initialization of trainer
    agent = TD3(
        state_dim, action_dim, action_range, args.hidden_dim, replay_buffer, args.delayed_update_itr, args.q_lr, args.policy_lr
    )
    t0 = time.time()
    # training loop; loadModel() resumes from a saved model if one exists
    agent.loadModel()
    if args.train:
        frame_idx = 0
        all_episode_reward = []
        # need an extra call here to make inside functions be able to use model.forward
        state = env.reset().astype(np.float32)
        agent.policy_net([state])
        agent.target_policy_net([state])
        for episode in range(args.train_episodes):
            state = env.reset().astype(np.float32)
            episode_reward = 0
            for step in range(args.max_steps):
                if args.render:
                    env.render()
                # random warm-up actions for the first explore_steps frames,
                # then noisy policy actions
                if frame_idx > args.explore_steps:
                    action = agent.policy_net.get_action(state, args.explore_noise_scale)
                else:
                    action = agent.policy_net.sample_action()
                next_state, reward, done, _ = env.step(action)
                next_state = next_state.astype(np.float32)
                # store done as 0/1 so (1 - done) works in the TD target
                done = 1 if done is True else 0
                replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                frame_idx += 1
                # several gradient updates per environment step, once the
                # buffer holds more than one batch
                if len(replay_buffer) > args.batch_size:
                    for i in range(args.update_itr):
                        agent.update(args.batch_size, args.eval_noise_scale, args.reward_scale, args.gamma, args.tau)
                if done:
                    break
            if episode == 0:
                all_episode_reward.append(episode_reward)
            else:
                # exponential moving average of episode rewards for a smoother curve
                all_episode_reward.append(all_episode_reward[-1] * 0.9 + episode_reward * 0.1)
            print(
                'Training | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                    episode + 1, args.train_episodes, episode_reward,
                    time.time() - t0
                )
            )
            # save the model every 100 episodes
            if episode % 100 == 0:
                agent.saveModel()
        plt.plot(all_episode_reward)
        if not os.path.exists('image'):
            os.makedirs('image')
        plt.savefig(os.path.join('image', '_'.join([ALG_NAME, ENV_ID])))
    else:
        # need an extra call here to make inside functions be able to use model.forward
        state = env.reset().astype(np.float32)
        agent.policy_net([state])
        for episode in range(args.test_episodes):
            state = env.reset().astype(np.float32)
            episode_reward = 0
            frames = []
            for step in range(args.max_steps):
                env.render()
                frames.append(env.render(mode='rgb_array'))
                # greedy=True: act deterministically (no exploration noise) at test time
                action = agent.policy_net.get_action(state, args.explore_noise_scale, greedy=True)
                state, reward, done, info = env.step(action)
                state = state.astype(np.float32)
                episode_reward += reward
                if done:
                    break
            print(
                'Testing | Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                    episode + 1, args.test_episodes, episode_reward,
                    time.time() - t0
                )
            )
            # save this episode as a gif
            if args.save_gif:
                dir_path = os.path.join('testVideo', '_'.join([ALG_NAME, ENV_ID]))
                if not os.path.exists(dir_path):
                    os.makedirs(dir_path)
                # NOTE(review): '\\' assumes Windows; os.path.join would be portable
                display_frames_as_gif(frames, dir_path + '\\' + str(episode) + ".gif")
    env.close()
训练结果
2000次