tokenizers
Add display capabilities to tokenizers objects: evaluating a Tokenizer in the REPL (or calling repr() on it) now shows the full pipeline, i.e. the normalizer, pre-tokenizer, model, post-processor, decoder, added vocabulary, and the truncation/padding settings.
>>> from tokenizers import Tokenizer
>>> Tokenizer.from_pretrained("ArthurZ/new-t5-base")
Tokenizer(normalizer=normalizers.Sequence([normalizers.Precompiled(), normalizers.Strip(strip_left=false, strip_right=true), normalizers.Replace(pattern=Regex(" {2,}"), content="▁", regex=SysRegex { regex: Regex { raw: 0x1069ca350 } })]), pre_tokenizer=PreTokenizer(pretok=Metaspace(replacement='▁', prepend_scheme="first", split=true)), model=Unigram(vocab={'<pad>': 0, '</s>': 0, '<unk>': 0, '▁': -2.012292861938477, 'X': -2.486478805541992, ...}, unk_id=2, bos_id=32101, eos_id=32102), post_processor=TemplateProcessing(single=Template([Sequence { id: A, type_id: 0 }, SpecialToken { id: "</s>", type_id: 0 }]), pair=Template([Sequence { id: A, type_id: 0 }, SpecialToken { id: "</s>", type_id: 0 }, Sequence { id: B, type_id: 0 }, SpecialToken { id: "</s>", type_id: 0 }])), decoder=Metaspace(replacement='▁', prepend_scheme="first", split=true), added_vocab=AddedVocabulary(added_tokens_map_r={
0: AddedToken(content="<pad>", single_word=false, lstrip=false, rstrip=false, normalized=false, special=true),
1: AddedToken(content="</s>", single_word=false, lstrip=false, rstrip=false, normalized=false, special=true),
2: AddedToken(content="<unk>", single_word=false, lstrip=false, rstrip=false, normalized=false, special=true), ...}, encode_special_tokens=false), truncation=None, padding=None)
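Since the feature targets tokenizers objects in general, the same kind of display should also be available on the pipeline attributes exposed by Tokenizer (normalizer, pre_tokenizer, model, post_processor, decoder), which lets you inspect a single stage on its own. A minimal sketch, assuming the same ArthurZ/new-t5-base checkpoint as above; outputs are omitted here because the exact rendering depends on the installed version:

>>> from tokenizers import Tokenizer
>>> tok = Tokenizer.from_pretrained("ArthurZ/new-t5-base")
>>> tok.pre_tokenizer    # only the Metaspace pre-tokenization step from the pipeline above
>>> tok.post_processor   # only the TemplateProcessing step
>>> print(tok)           # print() also renders the tokenizer's structure (formatting may differ from the bare repr)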