samout v1 预训练模型发布

数据集使用 minimind 数据集

训练代码

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from glob import glob
from tqdm import tqdm
from model import SamOut

import polars as pl
from collections import Counter


def train():
    """Continue pre-training SamOut on every ``pre_data_set_*.pkl`` shard.

    Loads the vocabulary from ``total_voc.pkl`` and the weights from
    ``pretrain.pth``, then makes 20 passes over all shards matched by
    ``./pre_data_set_*.pkl`` using batches of 100 sequences.  After each
    shard the weights are written back to ``pretrain.pth`` and the list of
    per-shard mean losses is pickled to ``loss916``.

    Requires a CUDA device.  Returns nothing.
    """
    batch_size = 100
    pad_id = 3  # index of "<|pad|>" in the vocabulary; excluded from the loss

    voc = pd.read_pickle("total_voc.pkl")

    # hidden size 512, 32 heads, 8 decoder layers
    net = SamOut(len(voc["voc"]), 512, 32, 8)
    net.load_state_dict(torch.load("pretrain.pth"))
    net.to("cuda")

    opt = torch.optim.Adam(params=net.parameters(), lr=0.00003)
    loss_func0 = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    bar = tqdm(range(20))
    steps = 0
    epoch_loss = []

    for epoch in bar:
        for one_path in tqdm(glob("./pre_data_set_*.pkl")):
            data_set = pd.read_pickle(one_path)
            np.random.shuffle(data_set)
            loss_list = []
            for i in range(0, len(data_set), batch_size):
                # Build the batch once as an integer tensor: no float32
                # round trip of the token ids and a single host->device copy.
                batch = torch.as_tensor(data_set[i:i + batch_size], dtype=torch.long).to("cuda")
                # Next-token prediction: inputs drop the last token,
                # targets drop the first.
                inputs, targets = batch[:, :-1], batch[:, 1:]

                out0, _ = net(inputs)
                loss = loss_func0(out0.reshape([-1, out0.shape[-1]]), targets.reshape([-1]))

                loss_list.append(loss.item())
                bar.set_description("epoch___{}____loss___{:.6f}____steps___{}".format(epoch, np.mean(loss_list), steps))
                opt.zero_grad()
                loss.backward()
                opt.step()
                steps += batch_size

            # Checkpoint after every shard so an interrupted run loses little.
            torch.save(net.state_dict(), "pretrain.pth")
            if loss_list:  # an empty shard has no mean loss to record
                epoch_loss.append(np.mean(loss_list))
            pd.to_pickle(epoch_loss, "loss916")





def _count_text_chars(text, counter, parts=8):
    """Add the character frequencies of *text* to *counter* in about *parts* chunks."""
    # max(1, ...) keeps the range step valid for texts shorter than *parts*.
    step = max(1, len(text) // parts)
    for start in tqdm(range(0, len(text), step)):
        counter.update(text[start:start + step])


def gen_one_voc():
    """Count character frequencies and split them into frequent / rare sets.

    Reads the ``text`` column of ``pretrain_data.csv`` plus the string cells
    of ``sft_data_multi.csv`` and ``sft_data_single.csv`` and counts every
    character.

    Writes two pickles:

    * ``voc.pkl``  - list of characters seen more than 100 times.
    * ``voc0.pkl`` - a one-element list holding ``[tokens, char]`` pairs for
      the remaining characters, where ``tokens`` spells the character's rank
      among the rare characters digit by digit as ``<|pos_{place}_{digit}|>``.
    """
    count = Counter()

    pretrain_text = pd.read_csv("pretrain_data.csv")["text"]
    # Non-string cells (e.g. NaN from empty rows) cannot be joined; skip them.
    _count_text_chars("".join(v for v in pretrain_text if isinstance(v, str)), count)
    del pretrain_text

    # Iterating a DataFrame yields its column labels, not its contents, so the
    # cell values have to be read column by column to count real characters.
    # NOTE(review): assumes every string cell in the SFT files is text that
    # should be in the vocabulary - confirm against the dataset schema.
    for csv_path in ("sft_data_multi.csv", "sft_data_single.csv"):
        frame = pd.read_csv(csv_path)
        for column in frame.columns:
            _count_text_chars("".join(v for v in frame[column] if isinstance(v, str)), count)
        del frame

    table = pd.DataFrame({"voc": list(count.keys()), "count": list(count.values())})
    frequent = table["count"] > 100
    voc = table.loc[frequent, "voc"].values.tolist()
    rare = table.loc[~frequent, "voc"].values.tolist()
    # The outer one-element list is part of the on-disk format read by gen_voc.
    voc0 = [[[["<|pos_{}_{}|>".format(place, digit) for place, digit in enumerate(str(rank))], char]
             for rank, char in enumerate(rare)]]
    pd.to_pickle(voc, "voc.pkl")
    pd.to_pickle(voc0, "voc0.pkl")


def gen_voc():
    """Assemble the final vocabulary and write it to ``total_voc.pkl``.

    Reads the frequent-character list from ``voc.pkl`` and the rare-character
    table from ``voc0.pkl``.  The 60 positional tokens ``<|pos_i_j|>``
    (``i`` in 0..5, ``j`` in 0..9) are added to the frequent characters, the
    result is sorted, and the five special tokens are placed in front so they
    occupy ids 0-4.  The rare table is turned into a dict that maps each rare
    character to its list of positional tokens.
    """
    frequent_chars = pd.read_pickle("voc.pkl")
    rare_table = pd.read_pickle("voc0.pkl")

    # voc0.pkl stores a one-element list of [tokens, char] pairs.
    rare_lookup = {}
    for tokens, char in rare_table[0]:
        rare_lookup[char] = tokens

    frequent_chars.extend(
        "<|pos_{}_{}|>".format(place, digit)
        for place in range(6)
        for digit in range(10)
    )

    specials = ["<|sos|>", "<|user|>", "<|agent|>", "<|pad|>", "<|history|>"]
    full_vocab = specials + sorted(frequent_chars)

    pd.to_pickle({"voc": full_vocab, "voc0": rare_lookup}, "total_voc.pkl")


def gen_pre_data_align(num, total_num):
    """Tokenise one slice of ``pretrain_data.csv`` into fixed-length id rows.

    The ``text`` column is divided into ``total_num`` equal slices of
    ``len(data) // total_num`` documents; slice ``num`` (1-based) is encoded
    and pickled to ``pre_data_set_{num}.pkl``.  Passing
    ``num == total_num + 1`` selects the remainder left over by the integer
    division.

    Each document becomes a list of exactly 512 ids: a frequent character
    maps to its own vocabulary id, a rare character maps to the ids of its
    positional tokens, and the row is truncated or padded with id 3
    (``<|pad|>``).  Rows are written last-document-first.  Empty or null
    documents are skipped, and characters missing from the vocabulary are
    dropped.

    Args:
        num: 1-based index of the slice to encode.
        total_num: number of equal slices the corpus is divided into.
    """
    seq_len = 512
    pad_id = 3

    voc = pd.read_pickle("total_voc.pkl")

    # token -> id, keeping the first occurrence like list.index() would.
    token_index = {}
    for idx, token in enumerate(voc["voc"]):
        token_index.setdefault(token, idx)

    # character -> list of ids.  One dict lookup per character replaces a
    # DataFrame merge per document.
    char_ids = {token: [idx] for token, idx in token_index.items()}
    for char, pos_tokens in voc["voc0"].items():
        char_ids[char] = [token_index[token] for token in pos_tokens]

    pre_data = pl.read_csv("pretrain_data.csv")
    pre_data = pre_data["text"].to_numpy().tolist()
    count = len(pre_data) // total_num
    pre_data = pre_data[(num - 1) * count:count * num]

    data_set = []
    for text in tqdm(reversed(pre_data), total=len(pre_data)):
        if not text:  # null or empty document: nothing to encode
            continue
        ids = []
        for char in text:
            ids.extend(char_ids.get(char, ()))
            if len(ids) >= seq_len:  # the rest would be truncated anyway
                break
        if not ids:
            continue
        ids = ids[:seq_len]
        ids += [pad_id] * (seq_len - len(ids))
        data_set.append(ids)
    pd.to_pickle(data_set, "pre_data_set_{}.pkl".format(num))


if __name__ == '__main__':
    # One-off data preparation. Uncomment and run in this order before the
    # first training run: gen_one_voc writes voc.pkl / voc0.pkl, gen_voc turns
    # them into total_voc.pkl, and gen_pre_data_align writes the
    # pre_data_set_*.pkl shards that train() reads.
    # gen_one_voc()
    # gen_voc()
    # for i in range(17,18):
    #     gen_pre_data_align(i, 16)

    # Train on the prepared shards.
    train()


这段代码是一个深度学习项目的训练部分,主要目的是训练一个名为 SamOut 的神经网络模型。以下是代码的主要组成部分和功能:

  1. 导入必要的库
    • 导入了 matplotlib.pyplot、numpy、pandas、torch、glob、tqdm、polars、Counter 等库,用于数据处理、模型训练和可视化。
  2. 定义 train 函数
    • 加载词汇表 total_voc.pkl
    • 初始化 SamOut 模型,并加载预训练权重 pretrain.pth
    • 将模型转移到 GPU 上。
    • 定义优化器 Adam 和损失函数 CrossEntropyLoss
    • 使用 tqdm 进度条进行训练,训练 20 个 epoch。
    • 在每个 epoch 中,遍历数据集,计算损失,更新模型权重。
    • 每处理完一个数据分片文件(pre_data_set_*.pkl)后,保存模型权重和损失记录。
  3. 定义 gen_one_voc 函数
    • 从 CSV 文件中读取文本数据,计算每个字符的出现频率。
    • 根据频率筛选出出现次数大于 100 的字符作为词汇表。
    • 将高频字符保存到 voc.pkl,其余低频字符及其对应的位置标记保存到 voc0.pkl。
  4. 定义 gen_voc 函数
    • 加载词汇表 voc.pkl 和 voc0.pkl。
    • 将词汇表扩展,加入位置标记 <|pos_i_j|> 以及特殊标记 <|sos|>、<|user|>、<|agent|>、<|pad|>、<|history|>。
    • 保存扩展后的词汇表到 total_voc.pkl
  5. 定义 gen_pre_data_align 函数
    • 加载扩展后的词汇表 total_voc.pkl
    • 读取 CSV 文件中的文本数据,将其转换为词汇表的索引表示。
    • 将数据集分割成多个部分,每个部分保存为一个 pickle 文件。
  6. 主函数
    • gen_one_voc、gen_voc、gen_pre_data_align 三个函数用于生成数据集(代码中默认被注释,首次运行前需取消注释并依次执行)。
    • 调用 train 函数进行模型训练。
      总的来说,这段代码的目的是训练一个基于字符的神经网络模型,用于处理文本数据。
import torch


class MaxState(torch.nn.Module):
    """Token mixer based on a running maximum along the sequence axis.

    Three bias-free linear projections of the input are combined as
    ``(m + c) * m + b``, where ``b`` and ``c`` are the ``head1`` / ``head2``
    projections and ``m`` is the cumulative maximum over the sequence of
    ``(head0(x) + head1(x)) / sqrt(head_size)``, computed per head.

    Args:
        hidden_dim: feature width; must be divisible by ``heads``.
        heads: number of heads the features are split into.
        win: accepted for interface compatibility; not used here.
    """

    def __init__(self, hidden_dim, heads, win):
        super().__init__()

        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."

        self.head_size = hidden_dim // heads
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.head0 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head1 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)

        self.head_num = heads
        self.hidden = hidden_dim

    def forward(self, input_data, state=None):
        """Return ``(mixed, state)``; ``state`` is passed through untouched."""
        batch, seq_len = input_data.shape[0], input_data.shape[1]
        heads, width = self.head_num, self.head_size
        split = (batch, seq_len, heads, width)

        proj_a = self.head0(input_data)
        proj_b = self.head1(input_data)
        proj_c = self.head2(input_data)

        # (batch, heads, seq, width): the running max is taken over dim 2,
        # i.e. over all positions up to and including the current one.
        per_head = proj_a.reshape(split).transpose(1, 2) + proj_b.reshape(split).transpose(1, 2)
        running_max = torch.cummax(per_head / width ** 0.5, dim=2).values

        # Back to (batch, seq, hidden).
        running_max = running_max.transpose(1, 2).reshape(batch, seq_len, -1)
        proj_b = proj_b.reshape(batch, seq_len, -1)

        mixed = (running_max + proj_c) * running_max + proj_b
        return mixed, state


class KAttention(torch.nn.Module):
    """Multi-head causal self-attention with bias-free q/k/v projections.

    Args:
        hidden_dim: feature width; must be divisible by ``heads``.
        heads: number of attention heads.
    """

    def __init__(self, hidden_dim, heads):
        super(KAttention, self).__init__()

        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."

        self.head_size = hidden_dim // heads
        self.q = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.k = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.v = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head_num = heads

    def forward(self, x, state=None):
        """Attend over ``x`` of shape (batch, seq, hidden).

        Returns ``(output, state)`` where ``output`` has the shape of ``x``
        and ``state`` is passed through untouched.
        """
        b, s, h, d = x.shape[0], x.shape[1], self.head_num, self.head_size
        # (batch, heads, seq, head_size)
        q = self.q(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])
        k = self.k(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])
        v = self.v(x).reshape([b, s, h, d]).permute([0, 2, 1, 3])

        scores = (q @ k.transpose(-1, -2)) / d ** 0.5
        # Lower-triangular mask built on the input's own device, so the layer
        # works wherever the module lives: position t sees positions <= t.
        causal = torch.tril(torch.ones(s, s, device=x.device)).bool()
        scores = scores.masked_fill(~causal, float("-inf"))

        out = torch.nn.functional.softmax(scores, -1) @ v
        out = out.permute([0, 2, 1, 3]).reshape([b, s, -1])
        return out, state


class FeedForward(torch.nn.Module):
    """Gated feed-forward block: ``ffn2(ffn1(x) * relu(gate(x)))``.

    Both the value path (``ffn1``) and the gate path (``gate``) expand the
    features to twice ``hidden_size``; ``ffn2`` projects the gated product
    back to ``hidden_size``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        expanded = hidden_size * 2

        # Creation order (ffn1, ffn2, gate) is kept so seeded initialisation
        # and state_dict ordering stay the same.
        self.ffn1 = torch.nn.Linear(hidden_size, expanded)
        self.ffn2 = torch.nn.Linear(expanded, hidden_size)
        self.gate = torch.nn.Linear(hidden_size, expanded)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Apply the gated feed-forward transform to ``x``."""
        value = self.ffn1(x)
        gate = self.relu(self.gate(x))
        return self.ffn2(value * gate)


class DecoderLayer(torch.nn.Module):
    """One decoder block: MaxState mixing, gated feed-forward, then LayerNorm.

    The feed-forward output and the block input are blended with a learnable
    scalar ``alpha`` (initialised to 0.5) before normalisation:
    ``layer_norm(alpha * ffn(mix(x)) + (1 - alpha) * x)``.
    """

    def __init__(self, hidden_size, num_heads):
        super().__init__()
        # Sub-module names and creation order are part of the checkpoint
        # format; the third MaxState argument (window) is fixed at 8.
        self.self_attention = MaxState(hidden_size, num_heads, 8)
        self.ffn = FeedForward(hidden_size)
        self.layer_norm = torch.nn.LayerNorm(hidden_size)
        self.alpha = torch.nn.Parameter(torch.tensor(0.5))

    def forward(self, x, state=None, seq_len=None):
        """Return ``(output, state)``; ``seq_len`` is accepted but unused."""
        mixed, state = self.self_attention(x, state)
        blend = self.alpha
        combined = blend * self.ffn(mixed) + (1 - blend) * x
        return self.layer_norm(combined), state


class SamOut(torch.nn.Module):
    """Character-level decoder-only language model.

    Token embeddings (padding id 3) pass through ``num_layers`` decoder
    layers.  Before every layer the learned positional embedding is
    concatenated to the features and projected back to ``hidden_size`` by
    that layer's ``down`` projection.  A bias-free linear head maps the
    result to vocabulary logits.

    Args:
        voc_size: number of vocabulary entries.
        hidden_size: feature width.
        num_heads: heads per decoder layer.
        num_layers: number of decoder layers.
    """

    def __init__(self, voc_size, hidden_size, num_heads, num_layers):
        super(SamOut, self).__init__()
        self.em = torch.nn.Embedding(voc_size, hidden_size, padding_idx=3)
        self.pos = torch.nn.Embedding(1024, hidden_size)

        self.decoder_layers = torch.nn.ModuleList([DecoderLayer(hidden_size, num_heads) for _ in range(num_layers)])
        self.head = torch.nn.Linear(hidden_size, voc_size, False)

        # One positional-fusion projection per decoder layer.
        self.down = torch.nn.ModuleList(
            [torch.nn.Linear(2 * hidden_size, hidden_size, False) for _ in range(num_layers)])

    def state_forward(self, state, pos, x):
        """Run all decoder layers over ``x``.

        Args:
            state: list with one entry per layer, or ``None`` to start fresh.
                A list passed in is updated in place.
            pos: positional embedding broadcastable to the shape of ``x``.
            x: embedded tokens, shape (batch, seq, hidden).

        Returns:
            ``(features, state)``.
        """
        if state is None:
            state = [None] * len(self.decoder_layers)
        # Broadcast the positions over the batch as a view; no allocation and
        # no dependence on a module-level device setting.
        pos = pos.expand(x.shape[0], -1, -1)
        for i, (down, decoder_layer) in enumerate(zip(self.down, self.decoder_layers)):
            x = down(torch.concat([pos, x], -1))
            x1, state[i] = decoder_layer(x, state[i])
            x = x1 + x  # residual connection around the decoder layer
        return x, state

    def pos_forward(self, x):
        """Return positional embeddings of shape (1, seq, hidden) for ``x``.

        Sequences of 1024 positions or more reuse the 1024-entry table twice:
        the embedding of ``position % 1024`` plus the embedding of
        ``position // 1024``.
        """
        positions = torch.arange(0, x.shape[1], device=x.device)
        if x.shape[1] >= 1024:
            pos = self.pos(positions % 1024) + self.pos(positions // 1024)
        else:
            pos = self.pos(positions)
        return pos.unsqueeze(0)

    def forward(self, x0):
        """Return ``(logits, state)`` for token ids ``x0`` of shape (batch, seq)."""
        logits, state = self.one_forward(x0, state=None)
        return logits, state

    def one_forward(self, x, state=None, seq_len=None):
        """Embed ``x``, add positions, run the decoder stack and the output head.

        ``seq_len`` is accepted but unused.
        """
        x = self.em(x)
        pos = self.pos_forward(x)
        x, state = self.state_forward(state, pos, x)
        return self.head(x), state


# Device read by the smoke test below and by any module code that uses this
# global. Falls back to CPU when CUDA is unavailable so the file still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
if __name__ == '__main__':
    # Smoke test: a small model (vocab 235, hidden 256, 16 heads, 4 layers)
    # run once on a random batch of 2 sequences of 104 token ids.
    net = SamOut(235, 256, 16, 4)
    net.to(device)
    net(torch.randint(0, 200, [2, 8 * 13]).to(device))

猜你喜欢

转载自blog.csdn.net/weixin_32759777/article/details/143528553