import torch
import evaluate
import numpy as np
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AdamW
from transformers import get_scheduler
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
Fine-tuning a model with Transformers
checkpoint = "bert-base-uncased"
raw_datasets = load_dataset('glue', 'mrpc')
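This walkthrough fine-tunes bert-base-uncased on the GLUE MRPC paraphrase task with a plain PyTorch training loop (no Trainer API), then evaluates accuracy and F1 on the validation split.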
Found cached dataset glue (/home/akhlak/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 907.92it/s]
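The returned DatasetDict can be inspected directly to confirm the splits and features; a quick sketch (not part of the original run):

# Show the train/validation/test splits and look at one raw example pair
print(raw_datasets)
print(raw_datasets['train'][0])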
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(subset):
    return tokenizer(subset['sentence1'], subset['sentence2'],
                     truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
Loading cached processed dataset at /home/akhlak/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-000c138c4db1edcb.arrow
Loading cached processed dataset at /home/akhlak/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-ae70e2242c4dfe9f.arrow
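To see what the tokenizer produces for a sentence pair, a minimal sketch (the example sentences are made up, not from the dataset):

# A BERT tokenizer returns input_ids, token_type_ids and attention_mask for each pair
sample = tokenizer("The company said profits rose.", "Profits rose, the company said.", truncation=True)
print(sample.keys())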
# Update dataset for the model: the classes should be in a column named 'labels'
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Remove the original raw columns
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])

# Return PyTorch tensors, not Python lists
tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets['train'],
                              shuffle=True, batch_size=8,
                              collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets['validation'],
                             batch_size=8, collate_fn=data_collator)
# Check a dataloader batch
for batch in train_dataloader:
    break

{k: v.shape for k, v in batch.items()}
{'labels': torch.Size([8]),
'input_ids': torch.Size([8, 70]),
'token_type_ids': torch.Size([8, 70]),
'attention_mask': torch.Size([8, 70])}
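The sequence length (70 here) is set per batch by DataCollatorWithPadding, which pads each batch to its longest sequence, so other batches will generally have a different second dimension.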
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Check the model is working
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)
tensor(0.7214, grad_fn=<NllLossBackward0>) torch.Size([8, 2])
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
/home/akhlak/micromamba/envs/pytorch/lib/python3.11/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
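As the warning suggests, the PyTorch optimizer is a drop-in replacement here; the alternative line would be:

# Equivalent, non-deprecated optimizer from PyTorch itself
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)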
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)
print(num_training_steps)
1377
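The 1377 steps come from 3 epochs of 459 batches each: the MRPC training split has 3,668 pairs, which at batch size 8 gives 459 batches (3668 / 8 rounded up).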
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device
device(type='cuda')
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
100%|██████████| 1377/1377 [02:05<00:00, 11.88it/s]
metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

metric.compute()
Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 4.16MB/s]
{'accuracy': 0.8676470588235294, 'f1': 0.9072164948453608}
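After three epochs of fine-tuning, the model reaches about 86.8% accuracy and an F1 of about 0.91 on the MRPC validation set of 408 pairs.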