<Source and references>
https://huggingface.co/learn/nlp-course/chapter3/1
Introduction - Hugging Face NLP Course
0. Summary
- Using the Trainer from transformers makes fine-tuning straightforward.
- Load a model that matches the NLP task (classification, generation, etc.).
- The full code is given first, followed by explanations of each part.
- The code is based on the Hugging Face course code.
- You need to know how to build a PyTorch dataset with torch.utils.data.Dataset (required to train on your own data).
1. Full code with explanation
# Load libraries (A)
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification  # model class for sequence classification
from transformers import DataCollatorWithPadding  # for the dynamic padding below; no optimizer import is needed since the Trainer uses AdamW by default
# Load the model and tokenizer (B)
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # set num_labels to the number of classes
# Load the data (C)
from datasets import load_dataset
raw_datasets = load_dataset('glue', 'mrpc')
# Function to apply the tokenizer consistently (D)
def tokenizer_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)
# Use raw_datasets.map to apply tokenizer_function to every example (E)
tokenized_datasets = raw_datasets.map(tokenizer_function, batched=True)
# Create the collator for dynamic padding (F)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
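# (Optional) Sanity-check the dynamic padding -- not part of the original post.
# Take 8 tokenized samples, drop the raw-text columns, and let the collator pad them;
# the column names ('idx', 'sentence1', 'sentence2') assume the GLUE/MRPC dataset loaded above.
samples = tokenized_datasets['train'][:8]
samples = {k: v for k, v in samples.items() if k not in ['idx', 'sentence1', 'sentence2']}
print([len(x) for x in samples['input_ids']])    # lengths differ from sample to sample
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})    # every tensor is padded to the longest sample in the batch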
# Build the evaluation function (G)
import evaluate
import numpy as np
def compute_metrics(eval_preds):
    metric = evaluate.load('glue', 'mrpc')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
# Set the training arguments (H)
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")  # only the output directory "test-trainer" is set; evaluation runs every epoch
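# (Optional) Other commonly used arguments -- values below are illustrative, not from the original post:
# training_args = TrainingArguments(
#     "test-trainer",
#     evaluation_strategy="epoch",
#     num_train_epochs=3,              # default is 3
#     per_device_train_batch_size=8,   # default is 8
#     learning_rate=5e-5,              # default is 5e-5
#     weight_decay=0.01,
# )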
# Load and define the Trainer (I)
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,  # when a tokenizer is passed, the Trainer already defaults to DataCollatorWithPadding, so this can be omitted, but it is clearer to write it out
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Fine-tuning (J)
trainer.train()
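- After training, the fine-tuned model can be scored on the validation set. The sketch below follows the same pattern as the Hugging Face course and reuses the compute_metrics function defined above:
# Predict on the validation split and compute accuracy/F1 from the returned logits
predictions = trainer.predict(tokenized_datasets['validation'])
print(predictions.predictions.shape, predictions.label_ids.shape)  # (408, 2) (408,)
print(compute_metrics((predictions.predictions, predictions.label_ids)))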
2. Fine-tuning on your own data
- You need to change the dataset format and the evaluation function.
- The data must be wrapped in a torch.utils.data.Dataset so that each item passed to the Trainer's train_dataset and eval_dataset has the form below.
{'input_ids': [[ 101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002,
2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809,
3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069,
1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010,
2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102]],
'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1]],
'label': 1}
- Building the Dataset (example)
from torch.utils.data import Dataset
class MyMapDataset(Dataset):
    # Define (initialize) the dataset (A)
    def __init__(self, data):
        self.data = data
    # Length of the dataset (B)
    def __len__(self):
        return len(self.data['label'])  # len(self.data) would count the dict keys, not the samples
    # Return one item (C)
    def __getitem__(self, index):
        return {'input_ids': self.data['input_ids'][index],
                'token_type_ids': self.data['token_type_ids'][index],
                'attention_mask': self.data['attention_mask'][index],
                'label': self.data['label'][index]}
- Write the output part (C) so that it returns a dict like the one below.
{'input_ids': [[ 101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002,
2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809,
3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069,
1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010,
2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102]],
'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1]],
'label': 1}
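- A quick check with made-up token IDs (illustrative values only) shows the dict returned for each index:
# Two toy samples in the dict-of-lists format expected by MyMapDataset
toy = {'input_ids': [[101, 2023, 102], [101, 2008, 2003, 102]],
       'token_type_ids': [[0, 0, 0], [0, 0, 0, 0]],
       'attention_mask': [[1, 1, 1], [1, 1, 1, 1]],
       'label': [1, 0]}
toy_dataset = MyMapDataset(toy)
print(len(toy_dataset))   # 2
print(toy_dataset[0])     # {'input_ids': [101, 2023, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1], 'label': 1}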
- Replace the evaluation function with sklearn's accuracy_score and f1_score.
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # use the metric names as the dictionary keys
    return {'accuracy': accuracy_score(labels, predictions), 'f1': f1_score(labels, predictions)}
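- The function can be sanity-checked without a model by passing made-up logits and labels (values below are illustrative):
fake_logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])  # 4 samples, 2 classes
fake_labels = np.array([1, 0, 0, 1])
print(compute_metrics((fake_logits, fake_labels)))  # {'accuracy': 0.5, 'f1': 0.5}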
- Final example code
# Load libraries (A)
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification  # model class for sequence classification
from transformers import DataCollatorWithPadding  # for the dynamic padding below; no optimizer import is needed since the Trainer uses AdamW by default
# Load the model and tokenizer (B)
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # set num_labels to the number of classes
# Load the data (C)
# the data will be converted into a PyTorch Dataset below
from datasets import load_dataset
raw_datasets = load_dataset('glue', 'mrpc')
# Convert the data format (D)
# this part could also be done inside the PyTorch Dataset's __init__
train_data = {'input_ids': [],
              'token_type_ids': [],
              'attention_mask': [],
              'label': []}
for i in range(3668):  # 3668 = size of the MRPC train split
    tokenize = tokenizer(raw_datasets['train']['sentence1'][i], raw_datasets['train']['sentence2'][i], truncation=True)
    train_data['input_ids'].append(tokenize['input_ids'])
    train_data['token_type_ids'].append(tokenize['token_type_ids'])
    train_data['attention_mask'].append(tokenize['attention_mask'])
    train_data['label'].append(raw_datasets['train']['label'][i])
valid_data = {'input_ids': [],
              'token_type_ids': [],
              'attention_mask': [],
              'label': []}
for i in range(408):  # 408 = size of the MRPC validation split
    tokenize = tokenizer(raw_datasets['validation']['sentence1'][i], raw_datasets['validation']['sentence2'][i], truncation=True)
    valid_data['input_ids'].append(tokenize['input_ids'])
    valid_data['token_type_ids'].append(tokenize['token_type_ids'])
    valid_data['attention_mask'].append(tokenize['attention_mask'])
    valid_data['label'].append(raw_datasets['validation']['label'][i])
# Convert into a PyTorch Dataset (E)
from torch.utils.data import Dataset
class TestDataset(Dataset):
    def __init__(self, data):
        # better: move part (D) in here, rewritten to adapt to the dataset size dynamically
        self.data = data
    def __len__(self):
        return len(self.data['label'])
    def __getitem__(self, index):
        return {'input_ids': self.data['input_ids'][index],
                'token_type_ids': self.data['token_type_ids'][index],
                'attention_mask': self.data['attention_mask'][index],
                'label': self.data['label'][index]}
train_dataset = TestDataset(train_data)
eval_dataset = TestDataset(valid_data)
# Create the collator for dynamic padding (F)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Build the evaluation function (G)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions), 'f1': f1_score(labels, predictions)}
# Set the training arguments (H)
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")  # only the output directory "test-trainer" is set; evaluation runs every epoch
# Load and define the Trainer (I)
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,  # when a tokenizer is passed, the Trainer already defaults to DataCollatorWithPadding, so this can be omitted, but it is clearer to write it out
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Fine-tuning (J)
trainer.train()
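- After training, trainer.evaluate() reports the validation metrics (the loss plus the accuracy and f1 returned by compute_metrics, prefixed with "eval_"); a minimal sketch:
# Evaluate the fine-tuned model on the validation Dataset built above
metrics = trainer.evaluate(eval_dataset)
print(metrics)  # e.g. {'eval_loss': ..., 'eval_accuracy': ..., 'eval_f1': ..., ...}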