fasttext.js

I like your code in https://github.com/huggingface/tokenizers/issues/1076.

DoctorSlimm opened this issue • 4 comments

DoctorSlimm avatar Apr 12 '23 20:04 DoctorSlimm

Thanks for the reference. Have a look at hf-tokenizers-experiments, where I have put together the whole tokenizer pipeline for the SentencePiece BPE tokenizer.

loretoparisi avatar Apr 12 '23 21:04 loretoparisi

🤩🤩🤩

DoctorSlimm avatar Apr 12 '23 21:04 DoctorSlimm

@loretoparisi do you know why the node tokenizer returns zero padding tokens after the input ids are finished?

```
{
  input_ids: [
    101, 7592, 2088, 999, 102, 0, 0, 0, 0, 0, 0, 0,
      0,    0,    0,   0,   0, 0, 0, 0, 0, 0, 0, 0,
      0,    0,    0,   0,   0, 0, 0, 0, 0, 0, 0, 0,
      0,    0,    0,   0,   0, 0, 0, 0, 0, 0, 0, 0,
      0,    0,    0,   0,   0, 0, 0, 0, 0, 0, 0, 0,
      0,    0,    0,   0,   0, 0, 0, 0, 0, 0, 0, 0,
      0,    0,    0,   0,   0, 0, 0, 0, 0, 0, 0, 0,
      0,    0,    0,   0,
```

```js
let { promisify } = require("util");
let { Tokenizer } = require("tokenizers/bindings/tokenizer");

(async () => {
    // Load the serialized tokenizer from disk.
    const tokenizer = Tokenizer.fromFile('./MiniLM-L6-v2/tokenizer.json');
    console.log(tokenizer);

    // The bindings expose callback-style methods; promisify them for await.
    const encode = promisify(tokenizer.encode.bind(tokenizer));
    const decode = promisify(tokenizer.decode.bind(tokenizer));

    const encoded = await encode("Hello World!");

    // Assemble the usual transformer model inputs from the encoding.
    const modelInputs = {
        input_ids: encoded.getIds(),
        attention_mask: encoded.getAttentionMask(),
        token_type_ids: encoded.getTypeIds()
    };

    console.log(modelInputs);
})();
```
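
A quick way to check where those zeros come from (a minimal sketch, assuming the MiniLM file follows the standard `tokenizer.json` layout, which carries top-level `padding` and `truncation` keys):

```js
// Sketch: inspect the serialized tokenizer config. A non-null "padding" entry
// means every encoding is padded to a configured length, which would explain
// the trailing zeros in input_ids.
const fs = require("fs");

const config = JSON.parse(fs.readFileSync("./MiniLM-L6-v2/tokenizer.json", "utf8"));
console.log(config.padding);    // padding strategy, or null when disabled
console.log(config.truncation); // truncation strategy, or null when disabled
```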

DoctorSlimm avatar Apr 12 '23 21:04 DoctorSlimm


Have a look at my examples here; you will find there is an option to pad the input to the max length:

```js
var lpTokenizer = await LPSentencePieceBPETokenizer.fromOptions({
    padMaxLength: false,
    vocabFile: "../vocab/minilm/minilm-vocab.json",
    mergesFile: "../vocab/minilm/minilm-merges.txt"
});
```

This padding is necessary to feed the model an input of the correct (fixed) size, typically the model's maximum sequence length.
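
When the padded tensor is what the model needs, downstream code can still tell real tokens from padding via the attention mask (1 for real tokens, 0 for padding). A minimal sketch, reusing only the `getIds()`/`getAttentionMask()` accessors from the snippet above; `stripPadding` is a hypothetical helper, not part of the library:

```js
// Sketch: recover the unpadded ids from an encoding using its attention mask
// (1 = real token, 0 = pad). `encoded` is an encoding like the one above.
function stripPadding(encoded) {
    const mask = encoded.getAttentionMask();
    return encoded.getIds().filter((_, i) => mask[i] === 1);
}

// e.g. stripPadding(encoded) -> [ 101, 7592, 2088, 999, 102 ]
```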

loretoparisi avatar Apr 13 '23 06:04 loretoparisi