代码实现TextCNN分类

调包:

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'#mac系统防止出错
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm ##进度可视化

读数据:

def read_data(train_or_test, num=None):
    '''
    :param train_or_test: 指定数据集用于训练还是测试
    :param num: 使用的数据数量
    :return: 文本及对应分类
    '''
    with open(os.path.join("data", train_or_test + ".txt"), encoding="utf-8") as f:
        all_data = f.read().split("\n") #split返回列表,以\n来分割

    texts = []
    labels = []
    for data in all_data:
        if data:
            text, label = data.split("\t")
            texts.append(text)
            labels.append(label)
    if num == None:
        return texts, labels
    else:
        return texts[:num], labels[:num]

构建字库,并且返回embedding:

def built_curpus(train_texts, embedding_num):
    word_2_index = {"<PAD>": 0, "<UNK>": 1}
    for text in train_texts:
        for word in text:
            word_2_index[word] = word_2_index.get(word, len(word_2_index))
    return word_2_index, nn.Embedding(len(word_2_index), embedding_num)

Dataset:

class TextDataset(Dataset):
    def __init__(self, all_text, all_label, word_2_index, max_len):
        self.all_text = all_text
        self.all_label = all_label
        self.word_2_index = word_2_index
        self.max_len = max_len

    def __getitem__(self, index):
        text = self.all_text[index][:self.max_len]
        label = int(self.all_label[index])

        text_idx = [self.word_2_index.get(i, 1) for i in text]
        text_idx = text_idx + [0] * (self.max_len - len(text_idx))#填充<PAD>
        text_idx = torch.tensor(text_idx).unsqueeze(dim=0)#列表变tensor 最终tensor的维度为1 * max_len

        return text_idx, label
    
    def __len__(self):
        return len(self.all_text)

构造CNN block:

class Block(nn.Module):
    def __init__(self, kernel_s, embeddin_num, max_len, hidden_num):
        '''
        :param kernel_s: 卷积核的大小
        :param embeddin_num:
        :param max_len:
        :param hidden_num: 输出通道数
        '''
        super().__init__()
        self.cnn = nn.Conv2d(in_channels=1, out_channels=hidden_num, kernel_size=(kernel_s, embeddin_num)) #  1 * 1 * 7 * 5 [batch_size, input_channels, seq_len, embeddin_num]
        self.act = nn.ReLU()
        self.mxp = nn.MaxPool1d(kernel_size=(max_len-kernel_s+1))

    def forward(self, batch_emb):# 1 * 1 * 7 * 5 [batch_size, input_channels, seq_len, embeddin_num]
        c = self.cnn.forward(batch_emb)#卷积操作,[batch_size, input_channels, seq_len, embeddin_num]->[batch_size, output_channels, seq_len - kernel_s + 1, 1]
        a = self.act.forward(c)#激活函数,[batch_size, output_channels, seq_len - kernel_s + 1, 1]
        a = a.squeeze(dim=-1)#去除最后一个维度,[batch_size, output_channels, seq_len - kernel_s + 1]
        m = self.mxp.forward(a)#最大池化,[batch_size, output_channels, 1]
        m = m.squeeze(dim=-1)#去除最后一个维度,[batch_size, output_channels]
        return m#[batch_size, output_channels]

构建TextCNN:

class TextCNNModel(nn.Module):
    def __init__(self, emb_matrix, max_len, class_num, hidden_num):
        super().__init__()

        self.emb_matrix = emb_matrix
        self.embeddin_num = emb_matrix.weight.shape[1]

        self.block1 = Block(2, self.embeddin_num, max_len, hidden_num)
        self.block2 = Block(3, self.embeddin_num, max_len, hidden_num)
        self.block3 = Block(4, self.embeddin_num, max_len, hidden_num)
        self.block4 = Block(5, self.embeddin_num, max_len, hidden_num)

        self.classifier = nn.Linear(hidden_num * 4, class_num)  # 2 * 3
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, batch_idx, batch_label = None):
        batch_emb = self.emb_matrix(batch_idx)
        b1_result = self.block1.forward(batch_emb)
        b2_result = self.block2.forward(batch_emb)
        b3_result = self.block3.forward(batch_emb)
        b4_result = self.block4.forward(batch_emb)

        feature = torch.cat([b1_result, b2_result, b3_result, b4_result], dim=1)# 1* 6 : [ batch * (3 * 2)]
        pre = self.classifier(feature)

        if batch_label is not None:
            loss = self.loss_fun(pre, batch_label)
            return loss
        else:
            return torch.argmax(pre, dim=-1)

主函数,训练模型并测试:

if __name__ == "__main__":
    train_text, train_label = read_data("train")
    dev_text, dev_label = read_data("dev")

    embeddin_num = 50
    max_len = 20
    batch_size = 200
    epoch = 1000
    lr = 0.001
    hidden_num = 2
    class_num = len(set(train_label))
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    word_2_index, words_embedding = built_curpus(train_text, embeddin_num)#构建字库,并且embedding

    train_dataset = TextDataset(train_text, train_label, word_2_index, max_len)
    train_loader = DataLoader(train_dataset, batch_size, shuffle=False)#没有乱序

    dev_dataset = TextDataset(dev_text, dev_label, word_2_index, max_len)
    dev_loader = DataLoader(dev_dataset, batch_size, shuffle=False)

    model = TextCNNModel(words_embedding, max_len, class_num, hidden_num).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr)

    for e in range(epoch):
        for batch_idx, batch_label in train_loader:
            batch_idx = batch_idx.to(device)
            batch_label = batch_label.to(device)
            loss = model.forward(batch_idx, batch_label)
            loss.backward()
            opt.step()
            opt.zero_grad()

        print(f"loss:{loss:.3f}")

        right_num = 0
        for batch_idx, batch_label in dev_loader:
            batch_idx = batch_idx.to(device)
            batch_label = batch_label.to(device)
            pre = model.forward(batch_idx)
            right_num += int(torch.sum(pre == batch_label))

        print(f"acc = {right_num/len(dev_text)*100:.2f}%")

猜你喜欢

转载自blog.csdn.net/fzz97_/article/details/129069349