Building a simple sentiment analysis model with HuggingFace

(The code below actually fine-tunes bert-base-uncased on GLUE MRPC, a sentence-pair classification task; the recipe is identical for sentiment analysis — just swap in a sentiment dataset.)

Reference video: "A concise HuggingFace tutorial with a hands-on BERT Chinese-model example — NLP pretrained models, the Transformers library, and a quick start to the datasets library" (bilibili)


# pip install datasets
# Load the GLUE MRPC dataset (sentence pairs labeled paraphrase / not paraphrase)
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
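To see what load_dataset returns, it helps to print the object; for MRPC you should get a DatasetDict with train, validation, and test splits, each example carrying sentence1, sentence2, label, and idx fields.

# Inspect the splits and one raw example
print(raw_datasets)
print(raw_datasets["train"][0])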


# pip install transformers
# Load the tokenizer that matches the checkpoint we will fine-tune
from transformers import AutoTokenizer
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
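As a quick sanity check, the tokenizer accepts a sentence pair directly and produces the input_ids, token_type_ids, and attention_mask that BERT expects; the two sentences below are made up for illustration.

# Tokenize a sentence pair: both sentences end up in one sequence,
# separated by [SEP] and distinguished by token_type_ids
inputs = tokenizer("This is the first sentence.", "This is the second one.")
print(inputs.keys())
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))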


# Batched preprocessing: tokenize the sentence pairs; padding is deferred to the data collator
def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
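map with batched=True appends the tokenizer's outputs as new columns alongside the original ones, which you can confirm by listing the column names:

# Original columns plus input_ids, token_type_ids, attention_mask
print(tokenized_datasets["train"].column_names)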


# Plays the role of a DataLoader's collate_fn: pads each batch
# to the length of its longest sequence (dynamic padding)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

# Drop the string/index columns that the collator cannot pad
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}

batch = data_collator(samples)
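Inspecting the collated batch shows the effect of dynamic padding: every tensor is padded only to the longest sequence among these 8 samples, and the label column is renamed to labels.

# All tensors share the batch's (not the model's) maximum length
print({k: v.shape for k, v in batch.items()})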


import numpy as np
# datasets.load_metric was deprecated and later removed; the evaluate library
# (pip install evaluate) provides the same GLUE metrics
import evaluate

# Load the metric once at module level instead of on every evaluation call
metric = evaluate.load("glue", "mrpc")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
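A quick smoke test with made-up logits shows what compute_metrics returns; MRPC reports both accuracy and F1.

# Two fake predictions that both match their labels → accuracy and F1 of 1.0
fake_logits = np.array([[0.2, 0.8], [0.9, 0.1]])
fake_labels = np.array([1, 0])
print(compute_metrics((fake_logits, fake_labels)))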


from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer


training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")  # newer transformers versions rename this argument to eval_strategy
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # wire in the metric function defined above
)

trainer.train()
predictions = trainer.predict(tokenized_datasets["validation"])
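trainer.predict returns a named tuple whose predictions field holds the raw logits (one row per validation example) along with label_ids and metrics; taking the argmax and re-running the metric closes the loop. The exact scores vary from run to run.

# Convert logits to class ids and score them against the gold labels
preds = np.argmax(predictions.predictions, axis=-1)
print(metric.compute(predictions=preds, references=predictions.label_ids))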


Reposted from blog.csdn.net/weixin_43135178/article/details/129597954