#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @version: v1.0
# @Author : Meng Li
# @contact: [email protected]
# @FILE : Torch_bert.py
# @Time : 2022/7/7 14:32
# @Software : PyCharm
# @site:
# @Description : 自己实现的Bert模型
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import re
import random
import numpy as np
import math
# Toy corpus: nine consecutive utterances, speakers alternating R / J.
text = (
    'Hello, how are you? I am Romeo.\n'  # R
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'  # J
    'Nice meet you too. How are you today?\n'  # R
    'Great. My baseball team won the competition.\n'  # J
    'Oh Congratulations, Juliet\n'  # R
    'Thank you Romeo\n'  # J
    'Where are you going today?\n'  # R
    'I am going shopping. What about you?\n'  # J
    'I am going to visit my grandmother. she is not very well'  # R
)
# Lower-case, strip the punctuation ".,!?-", and split into one string per sentence.
sentence = re.sub("[,.!?\\-]", "", text.lower()).split("\n")
# Distinct words of the corpus (iteration order of the set fixes each word's id).
vocab = list(set(" ".join(sentence).split(" ")))
# Special tokens occupy ids 0-3, corpus words follow from id 4.
# NOTE(review): 'SEQ' plays the role of BERT's '[SEP]', and the padding code in
# make_data pads with id 0 ('MASK') rather than word2idx['PAD'] — confirm.
word2idx = {'MASK': 0, 'CLS': 1, 'SEQ': 2, 'PAD': 3}
for position, word in enumerate(vocab):
    word2idx[word] = position + 4
# Inverse lookup: id -> word (ids equal insertion order, so this is exact).
idx2word = {index: word for word, index in word2idx.items()}
vocab_size = len(idx2word)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Token-id sequence for every sentence of the corpus.
token_list = [[word2idx[word] for word in sent.split(" ")] for sent in sentence]

# Hyper-parameters.
max_len = 30            # maximum (padded) sequence length
num_pred = 5            # maximum number of masked-LM targets per example
batch_size = 6          # examples per batch
n_layers = 6            # encoder layers
embedding_size = 768    # embedding width
segments_len = 2        # number of segment (sentence A/B) ids
embed_size = 768        # model width used by the attention layers
dim = 64                # per-head Q/K/V width
num_heads = 12          # attention heads
d_ff = 64               # feed-forward hidden width
dropout = 0.5           # dropout probability
class my_dataset(Dataset):
    """Wraps the five pre-training tensors and serves one example per index."""

    def __init__(self, input_ids, segment_ids, masked_pos, masked_tokens, isNext):
        super().__init__()
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_pos = masked_pos
        self.masked_tokens = masked_tokens
        self.isNext = isNext

    def __getitem__(self, index):
        # One example: (tokens, segments, mask positions, mask labels, NSP label).
        example = (
            self.input_ids[index],
            self.segment_ids[index],
            self.masked_pos[index],
            self.masked_tokens[index],
            self.isNext[index],
        )
        return example

    def __len__(self):
        # Dataset length is the leading dimension of the token tensor.
        return self.input_ids.size(0)
def make_data(seq_data):
    """
    Build a pre-training batch from the tokenised corpus.

    Each element is [input_ids, segment_ids, masked_pos, masked_tokens, isNext]:
    two sentences are sampled and joined as CLS a SEQ b SEQ, 15% of the tokens
    (capped at num_pred) become masked-LM targets, and isNext records whether
    sentence b immediately follows sentence a in the corpus.

    :param seq_data: list of token-id lists, one per corpus sentence
    :return: list of exactly batch_size examples, half positive / half negative
    """
    batch = []
    positive = negative = 0
    # Keep sampling until both halves of the batch are full. Using '<' (the
    # original used '<=') caps the batch at exactly batch_size examples.
    while positive < batch_size / 2 or negative < batch_size / 2:
        sen_a_idx = random.randrange(len(seq_data))
        sen_b_idx = random.randrange(len(seq_data))
        tokens_a = seq_data[sen_a_idx]
        tokens_b = seq_data[sen_b_idx]
        # CLS a SEQ b SEQ; segment 0 covers CLS..first SEQ, segment 1 the rest.
        input_ids = [word2idx['CLS']] + tokens_a + [word2idx['SEQ']] + tokens_b + [word2idx['SEQ']]
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # Choose 15% of the positions (never CLS/SEQ) as masked-LM targets.
        n_pred = min(num_pred, max(1, int(len(input_ids) * 0.15)))
        candidates = [pos for pos, tok in enumerate(input_ids)
                      if idx2word[tok] != 'CLS' and idx2word[tok] != 'SEQ']
        random.shuffle(candidates)
        masked_tokens, masked_pos = [], []
        for pos in candidates[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            p = random.random()
            # BERT masking scheme: 80% MASK, 10% random token, 10% unchanged.
            # (The original inverted the ratios and, in the "keep" branch,
            # overwrote the token with input_ids[loop_counter] by mistake.)
            if p < 0.8:
                input_ids[pos] = word2idx['MASK']
            elif p < 0.9:
                input_ids[pos] = input_ids[random.randrange(len(input_ids))]
            # else: keep the original token unchanged.

        # Pad the sequence to max_len.
        # NOTE(review): pads with id 0, which is word2idx['MASK'], not
        # word2idx['PAD'] — kept for compatibility with get_attn_pad_mask.
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)
        # Pad the masked-LM targets to the fixed length num_pred.
        if num_pred > n_pred:
            target_pad = num_pred - n_pred
            masked_pos.extend([0] * target_pad)
            masked_tokens.extend([0] * target_pad)

        isNext = sen_a_idx + 1 == sen_b_idx
        if isNext and positive < batch_size / 2:
            positive += 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
        elif not isNext and negative < batch_size / 2:
            negative += 1
            batch.append([input_ids, segment_ids, masked_pos, masked_tokens, isNext])
    return batch
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(Q·K^T / sqrt(d_k)) · V."""

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, len_q, len_k], True where attention is forbidden
        :return: context [batch_size, n_heads, len_q, d_v]

        Scaling by sqrt(d_k) keeps the dot products' variance near 1 so the
        softmax does not saturate and gradients stay usable.
        """
        d_k = Q.size(-1)  # derive from the tensor instead of the module-level `dim`
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(d_k)
        # Masked positions must be filled with -1e9 so softmax drives their
        # weight to ~0. The previous fill value of 0 did NOT mask anything:
        # a zero score still receives non-zero probability after softmax.
        scores = scores.masked_fill(attn_mask, -1e9)
        attn = torch.softmax(scores, dim=-1)  # [batch_size, n_heads, len_q, len_k]
        context = torch.matmul(attn, V)       # [batch_size, n_heads, len_q, d_v]
        return context
class Multi_Head_Attention(nn.Module):
    """Multi-head attention sub-layer with residual connection and LayerNorm."""

    def __init__(self):
        super().__init__()
        # Project the model width into num_heads independent dim-sized heads.
        self.W_Q = nn.Linear(embed_size, dim * num_heads, bias=False)
        self.W_K = nn.Linear(embed_size, dim * num_heads, bias=False)
        self.W_V = nn.Linear(embed_size, dim * num_heads, bias=False)
        # Map the concatenated heads back to the model width.
        self.projection = nn.Linear(num_heads * dim, embed_size)
        # Registered here so its affine parameters are trained; the previous
        # code built a fresh, untrained LayerNorm on every forward pass.
        self.layer_norm = nn.LayerNorm(embed_size)

    def forward(self, input_Q, input_K, input_V, atten_mask):
        """
        :param input_Q: [Batch_size, len_q, embed_size]
        :param input_K: [Batch_size, len_k, embed_size]
        :param input_V: [Batch_size, len_v(=len_k), embed_size]
        :param atten_mask: [Batch_size, len_q, len_k], True at padded positions
        :return: [Batch_size, len_q, embed_size]
        """
        residual = input_Q  # residual path added back after attention
        Batch_size, len_q, _ = input_Q.size()
        len_k = input_K.size(1)
        # Split heads with view + transpose: the projected layout is
        # [Batch, len, num_heads*dim], so the previous direct
        # view(Batch, num_heads, len, dim) scrambled the sequence/head axes.
        Q = self.W_Q(input_Q).view(Batch_size, len_q, num_heads, dim).transpose(1, 2)
        K = self.W_K(input_K).view(Batch_size, len_k, num_heads, dim).transpose(1, 2)
        V = self.W_V(input_V).view(Batch_size, len_k, num_heads, dim).transpose(1, 2)
        # Broadcast the pad mask over every head:
        # [Batch, len_q, len_k] -> [Batch, num_heads, len_q, len_k].
        atten_mask = atten_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
        context = ScaledDotProductAttention()(Q, K, V, atten_mask)
        # Merge heads: [Batch, num_heads, len_q, dim] -> [Batch, len_q, num_heads*dim].
        context = context.transpose(1, 2).reshape(Batch_size, len_q, -1)
        atten = self.projection(context).to(device)  # back to [Batch, len_q, embed_size]
        # Add & Norm. (The stray softmax the original applied to the output
        # before the residual add is not part of the transformer and is gone,
        # as is the global `torch.backends.cudnn.enabled = False` side effect.)
        return self.layer_norm(residual + atten)
class Feed_forward(nn.Module):
    """
    Position-wise feed-forward sub-layer (the Feed-Forward block of the paper)
    with residual connection and LayerNorm.

    Tip: check whether a tensor lives on the GPU with `x.is_cuda`.
    """

    def __init__(self):
        super().__init__()
        # nn.Linear already owns a trainable bias, so the extra torch.rand
        # b1/b2 tensors of the original — untrained, absent from state_dict,
        # and double-counting the bias — have been removed.
        self.W1 = nn.Linear(embed_size, d_ff).to(device)
        self.W2 = nn.Linear(d_ff, embed_size).to(device)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        # Built once, device-agnostic: the previous per-forward
        # nn.LayerNorm(...).cuda() crashed on CPU-only machines and its
        # parameters were never trained.
        self.layer_norm = nn.LayerNorm(embed_size).to(device)

    def forward(self, enc_inputs):
        """
        :param enc_inputs: [Batch_size, seq_len, embed_size]
        :return: [Batch_size, seq_len, embed_size]
        """
        hidden = self.relu(self.W1(enc_inputs))   # [Batch_size, seq_len, d_ff]
        hidden = self.dropout(hidden)             # was defined but never applied
        output = self.W2(hidden)                  # [Batch_size, seq_len, embed_size]
        # Add & Norm with the residual path.
        return self.layer_norm(output + enc_inputs)
class Encoder_layer(nn.Module):
    """One transformer encoder block: self-attention then a feed-forward net."""

    def __init__(self):
        super().__init__()
        self.multi_head_attention = Multi_Head_Attention()
        self.feed_forward = Feed_forward()

    def forward(self, enc_inputs, enc_atten_mask):
        """
        :param enc_inputs: [Batch_size, src_len, embedding_size]
        :param enc_atten_mask: [Batch_size, src_len, src_len]
        :return: (block output, attention sub-layer output), each of shape
                 [Batch_size, src_len, embedding_size]
        """
        # Self-attention: queries, keys and values all come from the same input.
        atten_output = self.multi_head_attention(
            enc_inputs, enc_inputs, enc_inputs, enc_atten_mask
        )
        output = self.feed_forward(atten_output).to(device)
        return output, atten_output
def get_attn_pad_mask(seq_q, seq_k):
    """
    Build the boolean padding mask consumed by the attention layers.

    :param seq_q: [Batch_size, len_q] query-side token ids
    :param seq_k: [Batch_size, len_k] key-side token ids
    :return: [Batch_size, len_q, len_k] tensor, True wherever the key token
             id is 0 (the padding value make_data uses)

    NOTE(review): id 0 is also word2idx['MASK'] in this file, so genuine MASK
    tokens are shut out of attention as well — confirm this is intended.
    """
    query_len = seq_q.size(1)
    # True at every padded key position ...
    pad_columns = seq_k.eq(0).unsqueeze(1)       # [Batch_size, 1, len_k]
    # ... replicated across every query position.
    return pad_columns.expand(-1, query_len, -1)  # [Batch_size, len_q, len_k]
def gelu(x):
    """
    Gaussian Error Linear Unit: x * Phi(x), with Phi the standard normal CDF.

    BERT uses GELU in place of ReLU. OpenAI GPT's variant differs slightly:
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    """
    normal_cdf = 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return x * normal_cdf
class BERT(nn.Module):
    """BERT encoder with its two pre-training heads: NSP and masked-LM."""

    def __init__(self):
        super().__init__()
        self.token_embed = torch.nn.Embedding(vocab_size, embedding_size).to(device)
        # Learned position embedding (a trainable matrix, not sinusoidal).
        self.pos_embed = torch.nn.Embedding(max_len, embedding_size).to(device)
        # Segment embedding distinguishing sentence A from sentence B.
        self.seg_embed = torch.nn.Embedding(segments_len, embedding_size).to(device)
        self.layers = nn.ModuleList(Encoder_layer() for _ in range(n_layers))
        self.fc1 = nn.Sequential(
            nn.Linear(embedding_size, embedding_size),
            nn.Dropout(0.5),
            nn.Tanh(),
        )
        self.classifier = nn.Linear(embedding_size, 2)  # NSP head: isNext / notNext
        self.fc2 = nn.Linear(embedding_size, vocab_size)  # masked-LM output head
        self.linear = nn.Linear(embedding_size, embedding_size)

    def forward(self, input_token, segments_, masked_pos):
        """
        :param input_token: [Batch_size, seq_len] token ids
        :param segments_: [Batch_size, seq_len] segment (A/B) ids
        :param masked_pos: [Batch_size, n_pred] positions to predict
        :return: (nsp_output [Batch_size, 2] logits,
                  nlm_output [Batch_size, n_pred, vocab_size] logits)
        """
        Batch_size, seq_len = input_token.size()
        # Position ids 0..seq_len-1, replicated for every batch row.
        pos = torch.arange(seq_len, dtype=torch.long, device=input_token.device)
        pos = pos.unsqueeze(0).expand(Batch_size, seq_len)
        # Sum of token, segment and position embeddings.
        output = self.token_embed(input_token) + self.seg_embed(segments_) + self.pos_embed(pos)
        enc_atten_mask = get_attn_pad_mask(input_token, input_token)
        for layer in self.layers:
            output, _ = layer(output, enc_atten_mask)  # [Batch_size, seq_len, embedding_size]

        # NSP head: classify from the CLS position (first token), as in the
        # paper. The previous code summed logits over the whole sequence —
        # letting padding positions vote — and then applied softmax before
        # CrossEntropyLoss, which applies log-softmax itself; raw logits are
        # returned here so the loss is computed once.
        nsp_output = self.classifier(self.fc1(output[:, 0]))  # [Batch_size, 2]

        # Masked-LM head: gather hidden states at the masked positions.
        gather_idx = masked_pos.unsqueeze(-1).repeat(1, 1, embedding_size)
        nlm_output = torch.gather(output, 1, gather_idx)  # [Batch_size, n_pred, embedding_size]
        nlm_output = gelu(self.linear(nlm_output))
        nlm_output = self.fc2(nlm_output)  # [Batch_size, n_pred, vocab_size]
        return nsp_output, nlm_output
def train():
    """Pre-train BERT (masked-LM + NSP objectives) on the toy corpus for 1000 epochs."""
    batch = make_data(token_list)
    # Transpose the list of examples into five per-field tensors.
    input_ids, segment_ids, masked_pos, masked_tokens, isNext = map(
        torch.LongTensor, zip(*batch))
    train_data = my_dataset(input_ids, segment_ids, masked_pos, masked_tokens, isNext)
    train_iter = DataLoader(train_data, batch_size, shuffle=True)
    criterion = torch.nn.CrossEntropyLoss()  # expects (logits, target)
    model = BERT().train().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    for step in range(1000):
        for input_ids_i, segment_ids_i, masked_pos_i, masked_tokens_i, isNext_i in train_iter:
            input_ids_i = input_ids_i.to(device)
            segment_ids_i = segment_ids_i.to(device)
            masked_pos_i = masked_pos_i.to(device)
            masked_tokens_i = masked_tokens_i.to(device)
            isNext_i = isNext_i.to(device)
            optimizer.zero_grad()
            nsp_out, nlm_out = model(input_ids_i, segment_ids_i, masked_pos_i)
            # NSP loss: binary classification over (notNext, isNext).
            classify_loss = criterion(nsp_out, isNext_i)
            # Masked-LM loss: flatten [B, n_pred, vocab] -> [B*n_pred, vocab].
            nlm_loss = criterion(nlm_out.view(-1, vocab_size), masked_tokens_i.view(-1))
            loss = nlm_loss + classify_loss
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print("step {0} nlm_loss {1:.6f} nsp_loss {2:.6f}".format(
                    step, nlm_loss.item(), classify_loss.item()))
# Entry point: run the toy pre-training loop when executed as a script.
if __name__ == '__main__':
    train()
目前NLP系列最后一道坎,Bert模型
Bert采用了Transformers模型中的Encoder模型,这里有6层Encoder。每层Encoder有12层多层注意力层(Multi-Head Attention)。
其中,输入语料库为有先后顺序的9句话。我这里将这9句话自制作为一个小型的数据集。
随机在语料库中选取两句话,如果这两句话有在整个文档中有先后顺序,那么isNext 字段为True
对于选取的两句话中的每一句话,随机Mask其中的某几个token,token就是字符级别的。
masked的方式有三种,80%的可能性将其变成'[MASKED]',10% 的可能性将其变成其他的token,还剩下10%的可能性将其保持不变。
这样做的好处就是,训练模型能够根据上下文预测当前被Masked的token,灵感是由Word2Vec的CBOW激发而来。因为CBOW也是由一个词周边的词语预测当前词。
至于要将剩下的10%的token 不进行任何处理,是因为下游任务中是没有训练集,也就是没有'MASKED',为了让模型能够适配下游任务,这里做了相应的处理。
if p > 0.8:
input_ids[cand_rep_idx] = word2idx['MASK']
elif p > 0.1:
other_idx = random.randrange(len(input_ids))
input_ids[cand_rep_idx] = input_ids[other_idx]
else:
input_ids[cand_rep_idx] = input_ids[word]
训练集中的每一条语句有三部分相加而成。input_ids, segment_ids, isNext
input_ids的构造方式上文讲了,segment_ids是为了区分两句话中,之间的分割
eg:segment_ids [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
将[input_ids , segment_ids, isNext] 字段经过一个Embedding层,得到一个大小为[Batch_size, seq_len, embedding_size]大小的张量。这里的embeding层中对pos位置的处理方法相对于Transformer有点不同,这里采用的是一个Embedding的矩阵来表示位置,该矩阵是可以学习的。
pos = torch.arange(seq_len, dtype=torch.long) # [seq_len]
pos = pos.unsqueeze(0) # [1, seq_len]
pos = pos.repeat(Batch_size, 1).to(device) # [Batch_size, seq_len]
张量经过num_layers 个Encoder层,输出接两个全连接层,网络设计完毕
模型的loss 分两个,一个是MLM的loss 一个是NSP的loss
MLM的loss 仅仅针对被masked的token的loss(nlm_loss),NSP的loss是对句向量进行分类的loss(nsp_loss)。但是由于nsp_loss 太小, nlm_loss 对整个loss的影响太大,nsp_loss下降不是很明显