Continuing the walkthrough of the T5 source code
Back to the overall structure of the T5 model
(0): T5LayerSelfAttention(
  (SelfAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
    (relative_attention_bias): Embedding(32, 8)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
  (DenseReluDense): T5DenseReluDense(
    (wi): Linear(in_features=512, out_features=2048, bias=False)
    (wo): Linear(in_features=2048, out_features=512, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
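This printout can be reproduced from a Hugging Face checkpoint whose sizes match the numbers above (512/2048, 8 heads, 32 relative-position buckets), e.g. t5-small; a minimal sketch (note that newer transformers releases may print the feed-forward sub-module under a different class name, e.g. T5DenseActDense):

import torch
from transformers import T5Model

model = T5Model.from_pretrained("t5-small")
# the first encoder block holds the self-attention sub-layer (the only one with
# relative_attention_bias) followed by the feed-forward sub-layer shown above
print(model.encoder.block[0])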
Next, let's look at the structure of the T5DenseReluDense module.
class T5DenseReluDense(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = nn.functional.relu(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states
For the configuration shown above, the relevant parameters are:
config.d_model = 512
config.d_ff = 2048
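As a quick sanity check (a minimal sketch, assuming the T5DenseReluDense class above and a hypothetical minimal config object), the feed-forward block expands the hidden dimension from d_model=512 to d_ff=2048 and projects it back:

import torch
import torch.nn as nn
from types import SimpleNamespace

# hypothetical minimal config matching the sizes in the printout above
config = SimpleNamespace(d_model=512, d_ff=2048, dropout_rate=0.1)

ff = T5DenseReluDense(config)            # class defined above
x = torch.randn(2, 10, config.d_model)   # (batch, seq_len, d_model)
out = ff(x)
print(ff.wi.weight.shape)                # torch.Size([2048, 512])
print(out.shape)                         # torch.Size([2, 10, 512]) -- back to d_model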
We also need to pay attention to the implementation of the T5LayerNorm layer.
First, here is the full T5LayerNorm source code:
class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # layer norm should always be calculated in float32
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into float16 if necessary
        if self.weight.dtype == torch.float16:
            hidden_states = hidden_states.to(torch.float16)
        return self.weight * hidden_states
The standard layer normalization formula is
$$y = \frac{x-\mu}{\sqrt{\mathrm{Var}(x)+\epsilon}}$$
where $\mu$ is the mean and $\epsilon$ is a small constant added to the variance for numerical stability.
The updated (T5-style) layer normalization is computed as
$$y = \mathrm{weight}\cdot\frac{x}{\sqrt{\mathrm{Var}(x)+\epsilon}}$$
Here the corresponding weight parameter is initialized to all ones:
self.weight = nn.Parameter(torch.ones(hidden_size))
For example, take the following tensor:
tensor = torch.FloatTensor([[1, 2, 4, 1],
                            [6, 3, 2, 4],
                            [2, 4, 6, 1]])
Computing the mean and the variance along the last dimension gives
E(x) = [2.0, 3.75, 3.25]
V(x) = [1.5, 2.1875, 3.6875]
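These numbers can be checked directly (a minimal sketch using the tensor above; V(x) here is the biased variance, i.e. unbiased=False):

import torch

tensor = torch.FloatTensor([[1, 2, 4, 1],
                            [6, 3, 2, 4],
                            [2, 4, 6, 1]])

print(tensor.mean(-1))                  # tensor([2.0000, 3.7500, 3.2500])
print(tensor.var(-1, unbiased=False))   # tensor([1.5000, 2.1875, 3.6875])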
For each value, the standard layer norm would then compute
$$y = \frac{x-\mu}{\sqrt{\mathrm{Var}(x)+\epsilon}}$$
whereas here this is changed to
$$y = \mathrm{weight}\cdot\frac{x}{\sqrt{\mathrm{Var}(x)+\epsilon}}$$
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
That is, x no longer has the mean $\mu$ subtracted (so the "variance" computed in the code is really just the mean of $x^2$), and at the end the result is simply multiplied by self.weight, which is initialized to all ones.
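Applying the T5-style normalization to the example tensor makes this concrete (a minimal sketch, assuming the T5LayerNorm class above): since the mean is not subtracted, the statistic in the code is the mean of the squares, which is why this is often described as an RMS norm.

import torch

x = torch.FloatTensor([[1, 2, 4, 1],
                       [6, 3, 2, 4],
                       [2, 4, 6, 1]])
eps = 1e-6

# "variance" in T5LayerNorm is the mean of x**2 -- the mean is NOT subtracted
variance = x.pow(2).mean(-1, keepdim=True)   # tensor([[ 5.5000], [16.2500], [14.2500]])
y = x * torch.rsqrt(variance + eps)          # weight is all ones at init, so it changes nothing

norm = T5LayerNorm(hidden_size=4)            # class defined above
print(torch.allclose(norm(x), y))            # True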
Another point to note is that T5LayerFF uses a residual connection:
class T5LayerFF(nn.Module):
    ......
    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states
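Putting the pieces together, this sub-layer follows the pre-norm residual pattern: normalize, feed-forward, dropout, then add back the original input. A minimal sketch of an equivalent forward pass, assuming the modules defined above and a hypothetical config object:

import torch
from types import SimpleNamespace

config = SimpleNamespace(d_model=512, d_ff=2048, dropout_rate=0.1)

layer_norm = T5LayerNorm(config.d_model)
ff = T5DenseReluDense(config)
dropout = torch.nn.Dropout(config.dropout_rate)

hidden_states = torch.randn(2, 10, config.d_model)
# pre-norm residual: the residual branch carries the *un-normalized* input
out = hidden_states + dropout(ff(layer_norm(hidden_states)))
print(out.shape)   # torch.Size([2, 10, 512])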
The T5LayerCrossAttention layer
Let's now go back and look at the full structure of a T5 decoder block.
(0): T5LayerSelfAttention(
  (SelfAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
    (relative_attention_bias): Embedding(32, 8)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
  (EncDecAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
  (DenseReluDense): T5DenseReluDense(
    (wi): Linear(in_features=512, out_features=2048, bias=False)
    (wo): Linear(in_features=2048, out_features=512, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
First, compare the printed structures of T5LayerSelfAttention and T5LayerCrossAttention; at this point no obvious difference stands out
(from the printout above, the self-attention layer has an extra relative_attention_bias embedding).
Looking at the T5 Decoder code below, we can also see that it adds an a_bias argument.
Comparing the T5 Encoder and Decoder (bert4keras)
The T5 Encoder part:
x = self.apply(
    inputs=[x, x, x, position_bias],
    layer=MultiHeadAttention,
    arguments={'p_bias': 't5_relative'},
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=attention_name
)
The T5 Decoder part:
x = self.apply(
    inputs=[x, x, x, attention_mask, position_bias[0]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': True,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=self_attention_name
)
We can see that the T5 decoder part above adds an extra a_bias argument.
class T5LayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs
class T5LayerCrossAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs
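As a usage sketch (hedged: this relies on transformers internals and was written against the version whose source is quoted above, so exact signatures may differ in other releases), the self-attention layer only needs hidden_states, while the cross-attention layer additionally needs key_value_states taken from the encoder output:

import torch
from transformers import T5Config
from transformers.models.t5.modeling_t5 import (
    T5LayerSelfAttention,
    T5LayerCrossAttention,
)

config = T5Config(d_model=512, d_kv=64, num_heads=8, d_ff=2048)

self_attn = T5LayerSelfAttention(config, has_relative_attention_bias=True)
cross_attn = T5LayerCrossAttention(config)

decoder_hidden = torch.randn(2, 7, config.d_model)    # decoder-side states
encoder_hidden = torch.randn(2, 11, config.d_model)   # encoder output

out_self = self_attn(decoder_hidden)[0]                               # attends over the decoder states
out_cross = cross_attn(out_self, key_value_states=encoder_hidden)[0]  # attends over the encoder output
print(out_cross.shape)   # torch.Size([2, 7, 512])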
How bert4keras builds the encoder and decoder of the T5 model
Let's first look at how bert4keras invokes attention in the T5 encoder and decoder.
The encoder's attention call:
x = self.apply(
    inputs=[x, x, x, position_bias],
    layer=MultiHeadAttention,
    arguments={'p_bias': 't5_relative'},
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=attention_name
)
The decoder's attention calls (split into two parts: the self-attention and the cross-attention):
x = self.apply(
    inputs=[x, x, x, attention_mask, position_bias[0]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': True,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=self_attention_name
)
............
............
x = self.apply(
    inputs=[x, c, c, position_bias[1]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': None,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=cross_attention_name
)
From Su Jianlin's (bert4keras) code, the second of the two decoder attention calls above has the same structure as the attention call in the encoder
(the corresponding code in transformers looks much the same).
Another difference is that, in transformers, an embedding layer is applied at the start of both the encoder and the decoder.
(0): T5LayerSelfAttention(
  (SelfAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
    (relative_attention_bias): Embedding(32, 8)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
Replacing the T5LayerCrossAttention part with T5LayerSelfAttention produces an error, which finally exposes the difference between the two.
Let's look closely at how the T5LayerCrossAttention and T5LayerSelfAttention layers in transformers differ.
class T5LayerSelfAttention(nn.Module):
    ......
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):

class T5LayerCrossAttention(nn.Module):
    ......
    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
We find that T5LayerCrossAttention takes two extra parameters compared with T5LayerSelfAttention: key_value_states and query_length.
Everything else is the same.
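This is exactly why the swap described above fails: the two forward signatures are not interchangeable. A minimal sketch of the failure mode (hypothetical shapes; the exact error text depends on the installed transformers version):

import torch
from transformers import T5Config
from transformers.models.t5.modeling_t5 import T5LayerSelfAttention

config = T5Config(d_model=512, d_kv=64, num_heads=8, d_ff=2048)
self_attn = T5LayerSelfAttention(config, has_relative_attention_bias=True)

decoder_hidden = torch.randn(2, 7, config.d_model)
encoder_hidden = torch.randn(2, 11, config.d_model)

try:
    # using the self-attention layer where cross-attention is expected:
    # its forward() has no key_value_states parameter, so the call fails
    self_attn(decoder_hidden, key_value_states=encoder_hidden)
except TypeError as e:
    print(e)   # e.g. forward() got an unexpected keyword argument 'key_value_states'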
Going back to the bert4keras model, we find:
x = self.apply(
    inputs=[x, x, x, attention_mask, position_bias[0]],
    layer=MultiHeadAttention,
    arguments={
        'a_bias': True,
        'p_bias': 't5_relative'
    },
    heads=self.num_attention_heads,
    head_size=self.attention_head_size,
    out_dim=self.hidden_size,
    key_size=self.attention_key_size,
    use_bias=False,
    attention_scale=False,
    kernel_initializer=self.initializer,
    name=self_attention_name
)
The a_bias here is essentially the lower-triangular (causal) mask.
class LM_Mask(object):
    """Lower-triangular attention mask (for language modeling)."""
    def compute_attention_bias(self, inputs=None):
        """Build the mask by comparing the index sequence with itself."""
        if self.attention_bias is None:

            def lm_mask(s):
                seq_len = K.shape(s)[1]
                idxs = K.arange(0, seq_len)
                mask = idxs[None, :] <= idxs[:, None]
                mask = K.cast(mask, K.floatx())
                return -(1 - mask[None, None]) * 1e12

            self.attention_bias = self.apply(
                inputs=self.inputs[0],
                layer=Lambda,
                function=lm_mask,
                name='Attention-LM-Mask'
            )
        return self.attention_bias
If a_bias == True, the LM_Mask (lower-triangular mask) is added to the attention scores first and then the relative position bias is added; otherwise only the relative position bias is added.
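The same lower-triangular mask is easy to write in PyTorch as well (a minimal sketch, not bert4keras code; the names a_bias and position_bias here are just illustrative): disallowed positions receive a large negative value that is added to the attention scores, together with the relative position bias, before the softmax.

import torch

seq_len = 5
idxs = torch.arange(seq_len)
# mask[i, j] == 1 where key position j <= query position i (allowed), else 0
mask = (idxs[None, :] <= idxs[:, None]).float()
a_bias = -(1.0 - mask)[None, None] * 1e12          # shape (1, 1, seq_len, seq_len)

scores = torch.randn(1, 8, seq_len, seq_len)        # (batch, heads, q_len, k_len)
position_bias = torch.zeros_like(scores)            # placeholder for the relative position bias
attn = torch.softmax(scores + a_bias + position_bias, dim=-1)
print(torch.allclose(attn.tril(), attn))            # True: upper-triangular weights are zero

Adding -1e12 instead of multiplying by a 0/1 mask keeps the masking a simple addition, which is why it can be folded into the same step as the relative position bias.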