Import
# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is its replacement for decoder-only models like GPT-2
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
Init
model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# GPT-2 ships without a pad token; [PAD] is a brand-new id, so it would need
# model.resize_token_embeddings(len(tokenizer)) if it were ever fed to the model.
# It goes unused in this single-example run.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
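A common alternative for GPT-2 (sketched here, reusing the tokenizer above) is to recycle the end-of-text token as the pad token instead of introducing a new id, which sidesteps any embedding resize:

# Reuse EOS for padding; no new rows in the embedding matrix needed.
tokenizer.pad_token = tokenizer.eos_token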
Prepare data
context = 'Harry Potter is a series of'
label = 'seven fantasy novels'
context_input = tokenizer(context)
label_input = tokenizer(label)
# Trick: zero out the label's attention mask so label tokens can be told
# apart from context tokens once the two are concatenated below.
label_input['attention_mask'] = [0]*len(label_input['input_ids'])
context_input, label_input
({'input_ids': [18308, 14179, 318, 257, 2168, 286], 'attention_mask': [1, 1, 1, 1, 1, 1]},
{'input_ids': [26548, 8842, 16122], 'attention_mask': [0, 0, 0]})
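Note that the label is encoded without a leading space. GPT-2's byte-level BPE folds a leading space into the token itself, which is why the generated text later reads 'ofseven'. Tokenizing the label as ' seven fantasy novels' would keep the words separated:

# GPT-2 BPE: a leading space changes the token ids.
tokenizer(' seven fantasy novels')['input_ids']  # different ids than tokenizer('seven fantasy novels')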
model_input = {}
model_input['input_ids'] = context_input['input_ids'] + label_input['input_ids']
model_input['attention_mask'] = context_input['attention_mask'] + label_input['attention_mask']
# Build labels from a copy of the input ids, then blank out the context
# positions (attention_mask == 1) with -100 so that only the label tokens
# contribute to the loss.
model_input['labels'] = model_input['input_ids'][:]
for i, (l, a) in enumerate(zip(model_input['labels'], model_input['attention_mask'])):
    if a == 1: model_input['labels'][i] = -100
model_input
{'input_ids': [18308, 14179, 318, 257, 2168, 286, 26548, 8842, 16122],
'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0],
'labels': [-100, -100, -100, -100, -100, -100, 26548, 8842, 16122]}
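The attention mask is doing double duty here: besides marking the label span, the zeros also hide the label tokens from attention during the forward pass. The more conventional formulation attends to every real token and excludes only the context from the loss by position. A minimal sketch, reusing context_input and label_input from above:

# Conventional variant (sketch): attend to all real tokens,
# exclude only the context span from the loss.
n_ctx = len(context_input['input_ids'])
n_lbl = len(label_input['input_ids'])
alt_input = {
    'input_ids': context_input['input_ids'] + label_input['input_ids'],
    'attention_mask': [1] * (n_ctx + n_lbl),
    'labels': [-100] * n_ctx + label_input['input_ids'],
}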
# Convert every field to a tensor; GPT-2 treats the 1-D input as a batch of one.
for key in model_input.keys():
    model_input[key] = torch.LongTensor(model_input[key])
model_input
{'input_ids': tensor([18308, 14179, 318, 257, 2168, 286, 26548, 8842, 16122]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0]),
'labels': tensor([ -100, -100, -100, -100, -100, -100, 26548, 8842, 16122])}
outputs = model(**model_input,return_dict=True)
outputs.keys()
odict_keys(['loss', 'logits', 'past_key_values'])
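The reported loss is ordinary next-token cross-entropy: internally the model shifts logits and labels by one position and ignores the -100 entries. Roughly equivalent, as a sketch:

import torch.nn.functional as F

# Position t's logits predict token t+1; -100 labels drop out of the loss.
shift_logits = outputs.logits[..., :-1, :].contiguous()
shift_labels = model_input['labels'][..., 1:].contiguous()
manual_loss = F.cross_entropy(
    shift_logits.view(-1, shift_logits.size(-1)),
    shift_labels.view(-1),
    ignore_index=-100,
)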
Fine-tuning
# transformers' AdamW is deprecated; torch ships the same optimizer.
from torch.optim import AdamW
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-4)  # deliberately high lr: the goal is to overfit fast
# Move the single example onto the same device as the model.
for key in model_input.keys():
    model_input[key] = model_input[key].to(device)
# Step repeatedly on the one example until it is memorized.
for epoch in range(20):
    optim.zero_grad()
    outputs = model(**model_input, return_dict=True)
    loss = outputs['loss']
    print(loss)
    loss.backward()
    optim.step()
tensor(7.8934, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(3.5290, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3826, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.3651, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0056, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.4604, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.5182, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0006, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0176, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.1220, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.2433, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0061, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(1.3986, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.1735, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0446, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.0134, device='cuda:0', grad_fn=<NllLossBackward0>)
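The loss is noisy at this learning rate (note the spikes at epochs 6 and 14) but collapses toward zero, which is exactly what an overfitting sanity check should show. At this point the weights can be persisted like any checkpoint; a minimal sketch, where the directory name is purely illustrative:

# Save the fine-tuned weights and tokenizer side by side (path is arbitrary).
model.save_pretrained('gpt2-overfit-demo')
tokenizer.save_pretrained('gpt2-overfit-demo')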
Overfitting test
context = 'Harry Potter is a series of'
input_ids = tokenizer(context,return_tensors='pt')['input_ids'].to(device)
model.eval()
sample_outputs = model.generate(
    input_ids,
    do_sample=True,            # sample instead of greedy decoding
    max_length=10,             # total length, including the 6 context tokens
    top_k=10,                  # keep only the 10 most likely next tokens
    top_p=0.75,                # nucleus sampling cutoff
    num_return_sequences=3
)
print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Output:
----------------------------------------------------------------------------------------------------
0: Harry Potter is a series ofseven fantasy novels novels
1: Harry Potter is a series ofseven fantasy novels novels
2: Harry Potter is a series ofseven fantasy novels novels
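All three samples collapse to the memorized continuation: the glued 'ofseven' is the missing leading space noted earlier, and the trailing repetition simply fills the remaining length budget. Since this is a memorization check, a deterministic decode is an even cleaner test; a sketch, passing pad_token_id explicitly to silence the warning above:

# Greedy decoding: a fully overfit model should reproduce the label verbatim.
greedy = model.generate(
    input_ids,
    do_sample=False,
    max_length=10,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(greedy[0], skip_special_tokens=True))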
Source Code
Refs