fasttext.js
I like your code in https://github.com/huggingface/tokenizers/issues/1076
Thanks for the reference! Have a look at hf-tokenizers-experiments, where I have put together the whole tokenizer pipeline for a SentencePiece BPE tokenizer.
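For readers landing here, a minimal sketch of what such a pipeline looks like with the stock tokenizers node bindings; the high-level class name and `fromOptions` signature follow the package's README-style API, and the vocab/merges paths are placeholders, so treat the details as assumptions:

```js
let { SentencePieceBPETokenizer } = require("tokenizers");

(async () => {
    // Build a SentencePiece BPE tokenizer from vocab + merges files
    // (paths are placeholders, not the repo's actual files)
    const tokenizer = await SentencePieceBPETokenizer.fromOptions({
        vocabFile: "./vocab.json",
        mergesFile: "./merges.txt"
    });
    const encoded = await tokenizer.encode("Hello World!");
    console.log(encoded.getIds(), encoded.getTokens());
})();
```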
🤩🤩🤩
@loretoparisi do you know why the node tokenizer returns zero padding tokens after the input ids are finished?

```
{
  input_ids: [
    101, 7592, 2088, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
```
```js
let { promisify } = require("util");
let { Tokenizer } = require("tokenizers/bindings/tokenizer");

(async () => {
    // Load the pretrained tokenizer definition from disk
    const tokenizer = Tokenizer.fromFile('./MiniLM-L6-v2/tokenizer.json');
    console.log(tokenizer);
    // The native bindings are callback-based, so promisify encode/decode
    const encode = promisify(tokenizer.encode.bind(tokenizer));
    const decode = promisify(tokenizer.decode.bind(tokenizer));
    const encoded = await encode("Hello World!");
    // Collect the inputs expected by a BERT-like model
    const modelInputs = {
        input_ids: encoded.getIds(),
        attention_mask: encoded.getAttentionMask(),
        token_type_ids: encoded.getTypeIds()
    };
    console.log(modelInputs);
})();
```
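As a side note, the `decode` defined above is never used; a minimal sketch of round-tripping the ids with it, assuming the binding's second argument is a skip-special-tokens flag:

```js
// Continuing the snippet above: decode the ids back to text.
// The second argument is assumed to skip special tokens
// ([CLS], [SEP], and the [PAD] ids behind the trailing zeros).
const text = await decode(encoded.getIds(), true);
console.log(text); // e.g. "hello world !" (MiniLM uses an uncased vocab)
```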
Have a look at my examples here;
you will find that there is an option to pad the input to the max length:
```js
var lpTokenizer = await LPSentencePieceBPETokenizer.fromOptions({
    padMaxLength: false,
    vocabFile: "../vocab/minilm/minilm-vocab.json",
    mergesFile: "../vocab/minilm/minilm-merges.txt"
});
```
This is necessary to feed the model inputs of the correct (fixed) size, typically the model's maximum sequence length.
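Conceptually, padding to max length does something like the following; this is a hypothetical helper for illustration, not the library's API, and it assumes the pad id is 0 (the [PAD] token in BERT-style vocabs such as MiniLM's):

```js
// Hypothetical helper (not the library's API): pad or truncate token ids
// to a fixed length so the model always receives same-shaped input.
function padToMaxLength(ids, maxLength, padId = 0) {
    const inputIds = ids.slice(0, maxLength);      // truncate if too long
    const attentionMask = inputIds.map(() => 1);   // 1 = real token
    while (inputIds.length < maxLength) {
        inputIds.push(padId);                      // assumed [PAD] id 0
        attentionMask.push(0);                     // 0 = position to ignore
    }
    return { input_ids: inputIds, attention_mask: attentionMask };
}
```

That also explains the question above: the trailing zeros are [PAD] ids, and the attention mask tells the model to ignore those positions.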