minGPT
minGPT copied to clipboard
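The <|endoftext|> token isn't encoded correctly: encoding the string yields seven ordinary BPE ids instead of the single id 50256 that decodes to it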
import torch
from mingpt.bpe import BPETokenizer
# Reproduction script: shows that the end-of-text marker does not round-trip.
# The trailing comments record the output reported for each print.
EOT_TEXT = "<|endoftext|>"
EOT_ID = 50256

tokenizer = BPETokenizer()

# 1) Encode the literal marker string: it comes back as seven ids.
encoded_literal = tokenizer(EOT_TEXT)
print(encoded_literal) # tensor([[ 27, 91, 437, 1659, 5239, 91, 29]])

# 2) Decode the single id 50256: it maps back to the marker string.
decoded_marker = tokenizer.decode(torch.tensor([EOT_ID]))
print(decoded_marker) # '<|endoftext|>'

# 3) Decode and then re-encode: the result is the seven ids again, not [50256].
re_encoded = tokenizer(tokenizer.decode(torch.tensor([EOT_ID])))
print(re_encoded) # tensor([[ 27, 91, 437, 1659, 5239, 91, 29]])