# A concise HuggingFace tutorial: hands-on examples with a BERT model. NLP pretrained models and a quick start with the Transformers and datasets libraries (notes from a Bilibili video)
# pip install datasets
# Load the dataset: GLUE MRPC, pairs of sentences labeled as paraphrase or not
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
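# Optional sanity check: load_dataset returns a DatasetDict keyed by split;
# indexing a split yields a plain dict of columns
print(raw_datasets)              # shows the train/validation/test splits and their columns
print(raw_datasets["train"][0])  # {'sentence1': ..., 'sentence2': ..., 'label': ..., 'idx': ...}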
# pip install transformers
# Set up the tokenizer
from transformers import AutoTokenizer
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
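# Optional: what the tokenizer produces for a sentence pair (illustrative
# strings, not taken from the dataset). BERT packs both sentences into one
# sequence and uses token_type_ids to mark which sentence each token came from.
encoded = tokenizer("This is the first sentence.", "This is the second one.")
print(encoded.keys())  # input_ids, token_type_ids, attention_mask
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))  # [CLS] ... [SEP] ... [SEP]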
# Efficient, batched preprocessing over every split
def tokenize_function(examples):
    # Tokenize sentence pairs; padding is deferred to the data collator below
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
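# Optional: map() keeps the original columns and appends the tokenizer outputs
# (input_ids, token_type_ids, attention_mask)
print(tokenized_datasets["train"].column_names)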
# Dynamic padding: like a DataLoader's collate_fn, the collator takes batch_size
# examples at a time and pads them to the longest sequence in that batch
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)
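# Optional sketch: the same collator plugs into a plain PyTorch DataLoader as
# its collate_fn (the Trainer used below does this internally)
from torch.utils.data import DataLoader
train_ds = tokenized_datasets["train"].remove_columns(["idx", "sentence1", "sentence2"])
train_dataloader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=data_collator)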
# Demo: drop the string/id columns the collator cannot convert to tensors
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
batch = data_collator(samples)
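# Optional: every tensor in the batch is now padded to the same length, e.g.
# {'input_ids': torch.Size([8, 67]), ...}; the second dimension varies per batch
print({k: v.shape for k, v in batch.items()})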
import numpy as np
from datasets import load_metric  # deprecated in newer versions; use the evaluate library instead

metric = load_metric("glue", "mrpc")  # load once rather than on every evaluation call

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # pick the class with the highest logit
    return metric.compute(predictions=predictions, references=labels)  # accuracy and F1
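# Optional sanity check with toy values (not real model output): both toy
# predictions match their labels, so accuracy and F1 should be 1.0
print(compute_metrics((np.array([[0.1, 0.9], [0.8, 0.2]]), np.array([1, 0]))))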
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer

# Evaluate on the validation set at the end of every epoch
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # so per-epoch evaluation reports accuracy/F1, not just loss
)
trainer.train()
# Run inference on the validation split; returns logits, labels, and metrics
predictions = trainer.predict(tokenized_datasets["validation"])
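# Optional: PredictionOutput bundles the raw logits with the gold labels,
# e.g. shapes (408, 2) and (408,) for the MRPC validation split
print(predictions.predictions.shape, predictions.label_ids.shape)
preds = np.argmax(predictions.predictions, axis=-1)
print(metric.compute(predictions=preds, references=predictions.label_ids))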