TD3 Code Implementation

Code and Explanation

0.Runtime Environment

Device/Package          Version
python                  3.7.11
GPU                     GTX 1050
CUDA                    10.2
cudnn                   7.6.5
cudatoolkit             10.0.130
tensorflow-gpu          2.2.0
tensorlayer             2.2.3
tensorflow-probability  0.9.0

1.Imports and Parameter Settings

import argparse
import os
import random
import time

import gym
import numpy as np
import tensorflow as tf

import tensorflow_probability as tfp
import tensorlayer as tl
from tensorlayer.layers import Dense
from tensorlayer.models import Model

from matplotlib import animation
import matplotlib.pyplot as plt

Normal = tfp.distributions.Normal

parser = argparse.ArgumentParser()
parser.add_argument('--train', dest='train', action='store_true', default=False)
parser.add_argument('--render', dest='render', action='store_true', default=False)
parser.add_argument('--save_gif', dest='save_gif', action='store_true', default=False)

parser.add_argument('--train_episodes', type=int, default=2000)
parser.add_argument('--test_episodes', type=int, default=10)
# during training, each episode runs for at most max_steps steps
parser.add_argument('--max_steps', type=int, default=200)
# for the first explore_steps steps of training, actions are chosen by random sampling
parser.add_argument('--explore_steps', type=int, default=500)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--replay_buffer_size', type=int, default=int(5e5))

# hidden layer dimension
parser.add_argument('--hidden_dim', type=int, default=64)
# number of parameter updates per environment step during training
parser.add_argument('--update_itr', type=int, default=3)
# delayed update interval for the policy network
parser.add_argument('--delayed_update_itr', type=int, default=3)
parser.add_argument('--q_lr', type=float, default=3e-4)
parser.add_argument('--policy_lr', type=float, default=3e-4)
parser.add_argument('--gamma', type=float, default=0.95)
# soft update coefficient tau, used when updating the target networks
parser.add_argument('--tau', type=float, default=0.01)
# noise scale for exploration actions
parser.add_argument('--explore_noise_scale', type=float, default=1.0)
# noise scale for target policy smoothing when computing gradients
parser.add_argument('--eval_noise_scale', type=float, default=0.5)
parser.add_argument('--reward_scale', type=float, default=1.0)

args = parser.parse_args()

ALG_NAME = 'TD3'
ENV_ID = 'Pendulum-v0'  # environment id
RANDOM_SEED = 2  # random seed
  • Normal = tfp.distributions.Normal binds the normal-distribution class; normal = Normal(0, 1) then creates a concrete standard normal distribution object.
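
A minimal sketch (assuming the imports above) of how this distribution object is used for sampling noise; the scale 0.5 is illustrative only:

normal = Normal(0., 1.)            # standard normal distribution
noise = normal.sample((3,)) * 0.5  # draw 3 samples and scale them, as evaluate()/get_action() do below
print(noise.numpy())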

2.class ReplayBuffer

  • The implementation is the same as in DQN (code and explanation linked at the end).
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = int((self.position + 1) % self.capacity)

    def sample(self, batch_size = args.batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
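
A quick usage sketch (the dummy transitions below are illustrative; in the real script they come from env.step, as shown in the main program):

buffer = ReplayBuffer(capacity=1000)
s = np.zeros(3, dtype=np.float32)                  # dummy state (Pendulum-v0 observations are 3-dimensional)
for _ in range(100):
    buffer.push(s, np.array([0.0]), 0.0, s, 0)     # (state, action, reward, next_state, done)
print(len(buffer))                                 # 100
states, actions, rewards, next_states, dones = buffer.sample(batch_size=64)
print(states.shape, actions.shape)                 # (64, 3) (64, 1)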

3.class QNetwork(Model)

  • QNetwork approximates the action-value function Q(s, a).

  • QNetwork implements two methods:

    • __init__: initializes the network layers.
    • forward: chains the input, hidden, and output layers together so that information propagates forward. This method must be overridden; it defines the model's computation and how the layers are connected.
3.1.__init__
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
    super(QNetwork, self).__init__()
    input_dim = state_dim + action_dim
    w_init = tf.random_uniform_initializer(-init_w, init_w)

    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2')
    self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3')
3.2.forward
def forward(self, input):
    x = self.linear1(input)
    x = self.linear2(x)
    x = self.linear3(x)
    return x
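
A minimal sketch of how the network is called: state and action are concatenated along the last dimension and one Q-value comes out per sample (the dimensions below are those of Pendulum-v0 and are only for illustration):

q_net = QNetwork(state_dim=3, action_dim=1, hidden_dim=64)
q_net.train()                                    # TensorLayer models must be set to train/eval mode before being called
dummy_state = tf.zeros([8, 3])                   # batch of 8 states
dummy_action = tf.zeros([8, 1])                  # batch of 8 actions
q_values = q_net(tf.concat([dummy_state, dummy_action], axis=1))
print(q_values.shape)                            # (8, 1): one Q-value per (state, action) pair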

4.class PolicyNetwork(Model)

  • PolicyNetwork approximates the deterministic policy a = π(s).

  • The PolicyNetwork class implements five methods:

    • __init__: initializes the network layers.

    • forward: forward propagation.

    • evaluate: returns an action used when computing gradients; this is not the action actually executed in the environment but the (noisy) next action a_{t+1} produced by the target policy.

    • get_action: returns an action for interacting with the environment.

    • sample_action: samples a random action.

4.1.__init__
def __init__(self, state_dim, action_dim, hidden_dim, action_range=1., init_w=3e-3):
    super(PolicyNetwork, self).__init__()
    w_init = tf.random_uniform_initializer(-init_w, init_w)

    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=state_dim, name='policy1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2')
    self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3')
    self.output_linear = Dense(
        n_units=action_dim, W_init=w_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
        in_channels=hidden_dim, name='policy_output'
    )
    self.action_range = action_range
    self.action_dim = action_dim
4.2.forward
def forward(self, state):
    x = self.linear1(state)
    x = self.linear2(x)
    x = self.linear3(x)
    output = tf.nn.tanh(self.output_linear(x))  # unit range output [-1, 1]
    return output
4.3.evaluate
def evaluate(self, state, eval_noise_scale):
    """
    generate action with state for calculating gradients;
    eval_noise_scale: as the trick of target policy smoothing, for generating noisy actions.
        """
    state = state.astype(np.float32)
    action = self.forward(state)

    action = self.action_range * action

    # add noise
    normal = Normal(0, 1)
    noise = normal.sample(action.shape) * eval_noise_scale
    eval_noise_clip = 2 * eval_noise_scale
    noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip)
    action = action + noise
    return action
  • The evaluate method is used when updating the policy gradient. We do not want to sample actions that are overly risky or too far from the deterministic output, so the noise is clipped; otherwise there would be a small probability of sampling very large or very small values.
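
A small numerical sketch of this target-policy-smoothing clip (the scale 0.5 matches the default eval_noise_scale; the values are illustrative only):

eval_noise_scale = 0.5
noise = Normal(0., 1.).sample((5,)) * eval_noise_scale
clipped = tf.clip_by_value(noise, -2 * eval_noise_scale, 2 * eval_noise_scale)
print(clipped.numpy())   # every value now lies in [-1.0, 1.0]
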
4.4.get_action
def get_action(self, state, explore_noise_scale, greedy=False):
    """ generate action with state for interaction with environment """
    action = self.forward([state])
    action = self.action_range * action.numpy()[0]
    if greedy:
        return action
    # add noise
    normal = Normal(0, 1)
    noise = normal.sample(action.shape) * explore_noise_scale
    action = action + noise  # numpy array + tf tensor -> tf tensor, so .numpy() below is valid
    return action.numpy()
  • The get_action method is used when interacting with the environment. The noise is not clipped here, which increases the agent's exploration.
4.5.sample_action
def sample_action(self):
    """ generate random actions for exploration """
    a = tf.random.uniform([self.action_dim], -1, 1)
    return self.action_range * a.numpy()
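
A short usage sketch of the three action methods (the dimensions and action_range below are those of Pendulum-v0, purely for illustration):

policy = PolicyNetwork(state_dim=3, action_dim=1, hidden_dim=64, action_range=2.)
policy.train()
state = np.zeros(3, dtype=np.float32)
print(policy.get_action(state, explore_noise_scale=1.0))               # noisy action for interaction
print(policy.get_action(state, explore_noise_scale=1.0, greedy=True))  # deterministic action, e.g. for testing
print(policy.sample_action())                                          # uniform random action for the warm-up phase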

5.class TD3

  • The TD3 algorithm class implements six methods:
    • __init__: initializes the networks.
    • target_ini: initializes the target networks.
    • target_soft_update: soft-updates the target networks.
    • update: updates all network parameters.
    • saveModel: saves the model.
    • loadModel: loads the model.
5.1.__init__
  • Six networks are initialized: two separate Q networks and one policy network, together with their corresponding target networks.
def __init__(
    self, state_dim, action_dim, action_range, hidden_dim, replay_buffer, policy_target_update_interval=1,
    q_lr=3e-4, policy_lr=3e-4
):
    self.replay_buffer = replay_buffer

    # initialize all networks
    self.q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    print('Q Network (1,2): ', self.q_net1)
    print('Policy Network: ', self.policy_net)

    # initialize weights of target networks
    self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1)
    self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2)
    self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net)

    # set train mode
    self.q_net1.train()
    self.q_net2.train()
    self.target_q_net1.eval()
    self.target_q_net2.eval()
    self.policy_net.train()
    self.target_policy_net.eval()

    self.update_cnt = 0
    self.policy_target_update_interval = policy_target_update_interval

    self.q_optimizer1 = tf.optimizers.Adam(q_lr)
    self.q_optimizer2 = tf.optimizers.Adam(q_lr)
    self.policy_optimizer = tf.optimizers.Adam(policy_lr)
5.2.target_ini
  • Hard update: copy the network's parameters directly into the corresponding target network.
def target_ini(self, net, target_net):
    """ hard-copy update for initializing target networks """
    for target_param, param in zip(target_net.trainable_weights, net.trainable_weights):
        target_param.assign(param)
    return target_net
5.3.target_soft_update
  • Soft update (Polyak averaging): θ⁻ ← (1−τ)·θ⁻ + τ·θ
def target_soft_update(self, net, target_net, soft_tau):
    """ soft update the target net with Polyak averaging """
    for target_param, param in zip(target_net.trainable_weights, net.trainable_weights):
        target_param.assign(  # copy weight value into target parameters
            target_param * (1.0 - soft_tau) + param * soft_tau
        )
    return target_net
5.4.update
  • The update method does the following:
    • Sample batch_size transitions from the replay buffer.
    • Use the target policy network to compute the next action.
      • As in DQN with a target network, the target networks handle both action selection and value evaluation.
    • Normalize the rewards.
    • For the sampled transitions, take the minimum of the two target Q networks and compute the TD target (written out after this list).
    • Update both Q networks with the TD error.
    • Update the policy network by gradient ascent on the Q-value (delayed).
    • Soft-update the target networks.
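
Written out (notation mine, following TD3's target policy smoothing trick), the TD target computed in the code below is:

    y = r + γ·(1 − done)·min( Q1⁻(s', a'), Q2⁻(s', a') ),  with a' = π⁻(s') + ε and ε = clip( eval_noise_scale·N(0, 1), −2·eval_noise_scale, 2·eval_noise_scale )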
def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2):
    """ update all networks in TD3 """
    self.update_cnt += 1
    state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

    reward = reward[:, np.newaxis]  # expand dim
    done = done[:, np.newaxis]

    new_next_action = self.target_policy_net.evaluate(
        next_state, eval_noise_scale=eval_noise_scale
    )  # clipped normal noise
    reward = reward_scale * (reward - np.mean(reward, axis=0)) / (
        np.std(reward, axis=0) + 1e-6
    )  # normalize with batch mean and std; plus a small number to prevent numerical problem

    # Training Q Function
    target_q_input = tf.concat([next_state, new_next_action], 1)  # the dim 0 is number of samples
    target_q_min = tf.minimum(self.target_q_net1(target_q_input), self.target_q_net2(target_q_input))

    #TD target
    target_q_value = reward + (1 - done) * gamma * target_q_min  # if done==1, only reward
    q_input = tf.concat([state, action], 1)  # input of q_net

    with tf.GradientTape() as q1_tape:
        predicted_q_value1 = self.q_net1(q_input)
        q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value))
    q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights)
    self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights))

    with tf.GradientTape() as q2_tape:
        predicted_q_value2 = self.q_net2(q_input)
        q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value))
    q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights)
    self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights))

    # Training Policy Function
    if self.update_cnt % self.policy_target_update_interval == 0:
        with tf.GradientTape() as p_tape:
            # when updating the actor we add no noise: we want the actor to follow the maximum of Q, so adding noise here would serve no purpose
            new_action = self.policy_net.evaluate(
                state, eval_noise_scale=0.0
            )  # no noise, deterministic policy gradients
            new_q_input = tf.concat([state, new_action], 1)
            # """ implementation 1 """
            # predicted_new_q_value = tf.minimum(self.q_net1(new_q_input),self.q_net2(new_q_input))
            """ implementation 2 """
            predicted_new_q_value = self.q_net1(new_q_input)
            policy_loss = -tf.reduce_mean(predicted_new_q_value)
        p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights)
        self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights))

        # Soft update the target nets
        self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau)
        self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau)
        self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)
5.5.saveModel
def saveModel(self):
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if not os.path.exists(path):
        os.makedirs(path)
    extend_path = lambda s: os.path.join(path, s)
    tl.files.save_npz(self.q_net1.trainable_weights, extend_path('model_q_net1.npz'))
    tl.files.save_npz(self.q_net2.trainable_weights, extend_path('model_q_net2.npz'))
    tl.files.save_npz(self.target_q_net1.trainable_weights, extend_path('model_target_q_net1.npz'))
    tl.files.save_npz(self.target_q_net2.trainable_weights, extend_path('model_target_q_net2.npz'))
    tl.files.save_npz(self.policy_net.trainable_weights, extend_path('model_policy_net.npz'))
    tl.files.save_npz(self.target_policy_net.trainable_weights, extend_path('model_target_policy_net.npz'))
    print('Saved weights.')

5.6.loadModel
def loadModel(self):
    path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))
    if os.path.exists(path):
        print('Loading TD3 network parameters ...')
        extend_path = lambda s: os.path.join(path, s)
        tl.files.load_and_assign_npz(extend_path('model_q_net1.npz'), self.q_net1)
        tl.files.load_and_assign_npz(extend_path('model_q_net2.npz'), self.q_net2)
        tl.files.load_and_assign_npz(extend_path('model_target_q_net1.npz'), self.target_q_net1)
        tl.files.load_and_assign_npz(extend_path('model_target_q_net2.npz'), self.target_q_net2)
        tl.files.load_and_assign_npz(extend_path('model_policy_net.npz'), self.policy_net)
        tl.files.load_and_assign_npz(extend_path('model_target_policy_net.npz'), self.target_policy_net)
        print('Load weights!')
    else: 
        print("No model file find, please train model first...")

6.Main Program

6.1.Converting frames to a GIF
def display_frames_as_gif(frames, path):
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=5)
    anim.save(path, writer='pillow', fps=30)
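
A hedged usage sketch (the dummy frames and the output name demo.gif are illustrative; in the test loop the frames come from env.render(mode='rgb_array')):

frames = [np.zeros((64, 64, 3), dtype=np.uint8) for _ in range(10)]  # dummy RGB frames
display_frames_as_gif(frames, 'demo.gif')                            # writes demo.gif using the pillow writer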
6.2.The main function
if __name__ == '__main__':
    # initialization of env
    env = gym.make(ENV_ID)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_range = env.action_space.high  # scale action, [-action_range, action_range]

    # reproducible
    env.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    tf.random.set_seed(RANDOM_SEED)

    # initialization of buffer
    replay_buffer = ReplayBuffer(args.replay_buffer_size)
    # initialization of trainer
    agent = TD3(
        state_dim, action_dim, action_range, args.hidden_dim, replay_buffer, args.delayed_update_itr, args.q_lr, args.policy_lr
    )
    t0 = time.time()

    # training loop
    agent.loadModel()
    if args.train:
        frame_idx = 0
        all_episode_reward = []

        # need an extra call here to make inside functions be able to use model.forward
        state = env.reset().astype(np.float32)
        agent.policy_net([state])
        agent.target_policy_net([state])

        for episode in range(args.train_episodes):
            state = env.reset().astype(np.float32)
            episode_reward = 0

            for step in range(args.max_steps):
                if args.render:
                    env.render()
                if frame_idx > args.explore_steps:
                    action = agent.policy_net.get_action(state, args.explore_noise_scale)
                else:
                    action = agent.policy_net.sample_action()

                next_state, reward, done, _ = env.step(action)
                next_state = next_state.astype(np.float32)
                done = 1 if done is True else 0

                replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                frame_idx += 1

                if len(replay_buffer) > args.batch_size:
                    for i in range(args.update_itr):
                        agent.update(args.batch_size, args.eval_noise_scale, args.reward_scale, args.gamma, args.tau)
                if done:
                    break
            if episode == 0:
                all_episode_reward.append(episode_reward)
            else:
                all_episode_reward.append(all_episode_reward[-1] * 0.9 + episode_reward * 0.1)
            print(
                'Training  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                    episode + 1, args.train_episodes, episode_reward,
                    time.time() - t0
                )
            )
            # save the model every 100 episodes
            if episode % 100 == 0:
                agent.saveModel()

        plt.plot(all_episode_reward)
        if not os.path.exists('image'):
            os.makedirs('image')
        plt.savefig(os.path.join('image', '_'.join([ALG_NAME, ENV_ID])))
    else:
        # need an extra call here to make inside functions be able to use model.forward
        state = env.reset().astype(np.float32)
        agent.policy_net([state])

        for episode in range(args.test_episodes):
            state = env.reset().astype(np.float32)
            episode_reward = 0
            frames = []
            for step in range(args.max_steps):
                env.render()
                frames.append(env.render(mode='rgb_array'))

                action = agent.policy_net.get_action(state, args.explore_noise_scale, greedy=True)
                state, reward, done, info = env.step(action)
                state = state.astype(np.float32)
                episode_reward += reward
                if done:
                    break
            print(
                'Testing  | Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.format(
                    episode + 1, args.test_episodes, episode_reward,
                    time.time() - t0
                )
            )
            # save this episode as a gif
            if args.save_gif:
                dir_path = os.path.join('testVideo', '_'.join([ALG_NAME, ENV_ID]))
                if not os.path.exists(dir_path):
                    os.makedirs(dir_path)
                display_frames_as_gif(frames, os.path.join(dir_path, str(episode) + ".gif"))

    env.close()

Training Results

2000 training episodes

[Figures: training reward curves after 2000 episodes]


DQN with Target Code Implementation


Reposted from blog.csdn.net/weixin_40735291/article/details/120613594