Fine-tuning GPT-2 with Transformers and PyTorch




Import

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

Init

model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# GPT-2 ships without a pad token; add one and resize the embedding
# matrix so the new token id gets a row
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

Prepare data

context = 'Harry Potter is a series of'
label = 'seven fantasy novels'
context_input = tokenizer(context)
label_input = tokenizer(label)
# Zero out the label's attention mask so label tokens can be told apart
# from context tokens when the loss mask is built below
label_input['attention_mask'] = [0]*len(label_input['input_ids'])
context_input, label_input
({'input_ids': [18308, 14179, 318, 257, 2168, 286], 'attention_mask': [1, 1, 1, 1, 1, 1]},
 {'input_ids': [26548, 8842, 16122], 'attention_mask': [0, 0, 0]})
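A detail worth noticing here: GPT-2's byte-level BPE treats a leading space as part of the token, so 'seven' and ' seven' encode to different ids. The label above has no leading space, which is why the generated text later reads "ofseven" with no space. A quick way to see the difference:

# Leading spaces matter for GPT-2's byte-level BPE tokenizer
print(tokenizer('seven fantasy novels')['input_ids'])   # the ids shown above
print(tokenizer(' seven fantasy novels')['input_ids'])  # a different first id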
model_input = {}
# Concatenate context and label into a single training sequence
model_input['input_ids'] = context_input['input_ids'] + label_input['input_ids']
model_input['attention_mask'] = context_input['attention_mask'] + label_input['attention_mask']
# Start from a copy of input_ids, then set every context position to -100,
# the index CrossEntropyLoss ignores, so only the label tokens carry loss
model_input['labels'] = model_input['input_ids'][:]
for i, (l, a) in enumerate(zip(model_input['labels'], model_input['attention_mask'])):
    if a == 1: model_input['labels'][i] = -100
model_input
{'input_ids': [18308, 14179, 318, 257, 2168, 286, 26548, 8842, 16122],
 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0],
 'labels': [-100, -100, -100, -100, -100, -100, 26548, 8842, 16122]}
# Convert the Python lists to LongTensors for the model
for key in model_input.keys():
    model_input[key] = torch.LongTensor(model_input[key])
model_input
{'input_ids': tensor([18308, 14179,   318,   257,  2168,   286, 26548,  8842, 16122]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0]),
 'labels': tensor([ -100,  -100,  -100,  -100,  -100,  -100, 26548,  8842, 16122])}
# A forward pass with labels included returns the loss directly
outputs = model(**model_input, return_dict=True)
outputs.keys()
odict_keys(['loss', 'logits', 'past_key_values'])
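Because labels were supplied, the returned loss is already the cross-entropy over the label tokens. As a sanity check, here is a minimal sketch of reproducing it by hand, assuming the standard one-position shift that causal LMs such as GPT2LMHeadModel apply internally (manual_loss is an illustrative name):

import torch.nn.functional as F

# Position i predicts token i+1, so drop the last logit and the first label;
# positions labelled -100 are ignored by the cross-entropy
logits = outputs['logits']
shift_logits = logits[..., :-1, :].reshape(-1, logits.size(-1))
shift_labels = model_input['labels'][..., 1:].reshape(-1)
manual_loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100)
print(manual_loss, outputs['loss'])  # the two values should match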

Fine-tuning

from torch.optim import AdamW  # transformers' own AdamW is deprecated in favour of torch's
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-4)
# Move the prepared tensors onto the same device as the model
for key in model_input.keys():
    model_input[key] = model_input[key].to(device)
for epoch in range(20):
    optim.zero_grad()
    outputs = model(**model_input, return_dict=True)
    loss = outputs['loss']
    print(loss)
    loss.backward()
    optim.step()
tensor(7.8934, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.5290, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.3826, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.3651, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0056, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.4604, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.5182, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0176, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.1220, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2433, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.3986, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.1735, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0446, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0134, device='cuda:0', grad_fn=<NllLossBackward>)
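Everything above trains on a single example, so the [PAD] token added during init never actually gets used. With several (context, label) pairs, the sequences must be padded to a common length. The sketch below shows one way to build such a batch (make_batch and the example pairs are illustrative, not from the original post). Unlike the single-example trick above, it keeps attention_mask at 1 for every real token, the usual convention, and restricts the loss purely through -100 labels; zeroing the mask also hides the label tokens from attention, which the tiny demo tolerates but a real run should avoid.

# Sketch: preparing a padded batch of (context, label) pairs.
# make_batch is a hypothetical helper, not part of the original post.
def make_batch(pairs, tokenizer):
    features = []
    for context, label in pairs:
        ctx_ids = tokenizer(context)['input_ids']
        lbl_ids = tokenizer(' ' + label)['input_ids']      # leading space for GPT-2's BPE
        features.append((ctx_ids + lbl_ids,
                         [-100] * len(ctx_ids) + lbl_ids))  # loss on label tokens only
    max_len = max(len(ids) for ids, _ in features)
    pad_id = tokenizer.pad_token_id
    batch = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for input_ids, labels in features:
        pad = max_len - len(input_ids)
        batch['input_ids'].append(input_ids + [pad_id] * pad)
        batch['attention_mask'].append([1] * len(input_ids) + [0] * pad)
        batch['labels'].append(labels + [-100] * pad)
    return {k: torch.LongTensor(v) for k, v in batch.items()}

# Illustrative usage with made-up pairs
batch = make_batch([('Harry Potter is a series of', 'seven fantasy novels'),
                    ('The Lord of the Rings is written by', 'J. R. R. Tolkien')],
                   tokenizer)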

Overfitting test

context = 'Harry Potter is a series of'
input_ids = tokenizer(context,return_tensors='pt')['input_ids'].to(device)
model.eval()
# Sample three completions; max_length counts the six context tokens,
# so only four new tokens are generated
sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=10,
    top_k=10,
    top_p=0.75,
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.

Output:
----------------------------------------------------------------------------------------------------
0: Harry Potter is a series ofseven fantasy novels novels
1: Harry Potter is a series ofseven fantasy novels novels
2: Harry Potter is a series ofseven fantasy novels novels

The model has clearly memorized the label. The missing space in "ofseven" traces back to the label string having no leading space, as noted earlier, and the repeated "novels" is simply the fourth sampled token: six context tokens plus max_length=10 leave room for one token beyond the three memorized label tokens.
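With the loss driven to near zero, sampling is not really needed to check memorization; a deterministic greedy decode (a sketch, not in the original) should give the same continuation every time:

# Greedy decoding is deterministic, so it should reproduce the memorized
# label 'seven fantasy novels' plus one extra token up to max_length
greedy_output = model.generate(input_ids, do_sample=False, max_length=10)
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))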

