Serializing k-mer style pre-tokenizer
> Totally understandable.
> What kind of pre-tokenizer are you saving? If some building blocks are missing, we could add them to make the thing more composable/portable/shareable.
Hi @Narsil, k-mer tokenization is used in many applications in bioinformatics. Right now I am doing the following to define my tokenizer and to save and load my model, which I now know is not ideal. I wondered if there is a way to use serializable building blocks to save and load the tokenizer like any other HF tokenizer. Thank you.
from itertools import product
from typing import List

from tokenizers import Tokenizer, PreTokenizedString, NormalizedString
from tokenizers.pre_tokenizers import PreTokenizer, Whitespace
from tokenizers.models import WordLevel
# Define the pre-tokenization step: split the sequence into chunks of size k
class KmerPreTokenizer:
    def __init__(self, k: int, stride: int = None):
        self.k = k
        self.stride = stride if stride else k

    def split(self, i: int, normalized: NormalizedString) -> List[NormalizedString]:
        # Callback for PreTokenizedString.split: receives the piece index and a
        # NormalizedString, and returns the list of k-mer slices.
        seq = normalized.original  # no normalizer is attached, so original == normalized
        return [normalized[start:start + self.k]
                for start in range(0, len(seq) - self.k + 1, self.stride)]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.split)


class KmerDecoder:
    def decode(self, tokens: List[str]) -> str:
        return "".join(tokens)
# Build the vocabulary: all k-mers over the alphabet ACGTN,
# with the N-containing ("bad") k-mers placed at the end
k = 4
good_kmers = []
bad_kmers = []
kmers = [''.join(kmer) for kmer in product('ACGTN', repeat=k)]
for kmer in kmers:
    if "N" in kmer:
        bad_kmers.append(kmer)
    else:
        good_kmers.append(kmer)
kmers = good_kmers + bad_kmers
vocab = {word: i for i, word in enumerate(kmers)}
vocab["[UNK]"] = len(vocab)  # the unk_token used by WordLevel below must exist in the vocabulary
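# Quick sanity check (illustrative, not part of the original post):
# 5**4 = 625 k-mers plus the [UNK] entry, with N-free k-mers first.
print(len(vocab))   # 626
print(kmers[:3])    # ['AAAA', 'AAAC', 'AAAG']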
# Use the vocabulary and the pre-tokenizer to get a customized k-mer tokenizer:
# create a WordLevel model from the vocabulary and attach the custom pre-tokenizer
tok = Tokenizer(WordLevel(vocab=vocab, unk_token="[UNK]"))
tok.pre_tokenizer = PreTokenizer.custom(KmerPreTokenizer(k))
# tok.decoder = Decoder.custom(KmerDecoder())  # requires: from tokenizers.decoders import Decoder
# Optional: train the tokenizer (if you want to add more tokens or further refine it)
# trainer = WordLevelTrainer(special_tokens=["<MASK>", "<CLS>", "<UNK>"])
# tok.train_from_iterator(kmer_iter, trainer)
# Note: tok.save("path/to/tokenizer.json") would fail at this point, because a
# custom Python pre-tokenizer cannot be serialized; hence the workaround below.
seq = "ACGCGCGCGTGGAGCGCGATCGACTTT"
print("PreTokenize:", seq)
print(tok.pre_tokenizer.pre_tokenize_str(seq))
# Save the tokenizer: swap in a serializable built-in pre-tokenizer first,
# because the custom Python one cannot be written to tokenizer.json
from transformers import PreTrainedTokenizerFast

tok.pre_tokenizer = Whitespace()  # placeholder, only so that saving works
new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tok)
# new_tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(KmerPreTokenizer(k))
# Save the fast tokenizer
new_tokenizer.save_pretrained("tokenizers")
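# Why the custom pre-tokenizer has to be re-attached after loading (illustrative
# check, not part of the original post): the saved tokenizer.json only records
# the Whitespace placeholder, not the custom Python class.
import json
with open("tokenizers/tokenizer.json") as f:
    print(json.load(f)["pre_tokenizer"])   # expected: {'type': 'Whitespace'}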
# Load the tokenizer
from transformers import AutoTokenizer

loaded_tokenizer = AutoTokenizer.from_pretrained("tokenizers")
# Re-attach the custom pre-tokenizer, since only the Whitespace placeholder was saved
loaded_tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(KmerPreTokenizer(k))

# Test the loaded tokenizer
input_text = "ACGCGCGCGTGGAGCGCGATCGACNTTTT"
print(loaded_tokenizer.tokenize(input_text))
print(loaded_tokenizer(input_text))
Originally posted by @millanp95 in https://github.com/huggingface/tokenizers/issues/581#issuecomment-2388785949
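On the original question of serializable building blocks: when the stride equals k, the same non-overlapping chunking can be expressed with the built-in Split pre-tokenizer and a regular expression, which serializes into tokenizer.json like any other component, so nothing has to be swapped out before saving or re-attached after loading. The sketch below is not from the thread: it reuses the vocab built above, the output directory tokenizers_kmer is just a placeholder name, and, unlike the custom class, the regex keeps a trailing chunk shorter than k (which then maps to [UNK]).

from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Split
from transformers import PreTrainedTokenizerFast, AutoTokenizer

# Match every run of exactly k=4 characters; "isolated" keeps each match as its own piece
serializable_tok = Tokenizer(WordLevel(vocab=vocab, unk_token="[UNK]"))
serializable_tok.pre_tokenizer = Split(Regex(".{4}"), behavior="isolated")

# Split is a built-in component, so it survives the round trip through tokenizer.json
fast = PreTrainedTokenizerFast(tokenizer_object=serializable_tok, unk_token="[UNK]")
fast.save_pretrained("tokenizers_kmer")

reloaded = AutoTokenizer.from_pretrained("tokenizers_kmer")
print(reloaded.tokenize("ACGCGCGCGTGGAGCGCGATCGACNTTTT"))  # no manual re-attachment needed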