FasterTransformer
How to apply the top_k option to each input?
Hello, first of all, thank you for creating this library. I have two questions.
First question: I followed this guide and successfully started the Triton server.
Here is my request Python script, followed by my config.pbtxt:
#!/usr/bin/python
import time
import argparse
import numpy as np
import os
import re
import sys
import requests as httpreq
from builtins import range
import statistics as s
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype
from transformers import AutoTokenizer
from transformers import PreTrainedTokenizerFast
def inference(input_data: np.ndarray, fixed_output_len: int) -> np.ndarray:
    """
    input_data: (batch_size, 1, sentence_len)
    """
    model_name = "fastertransformer"

    # shape
    input_len = np.array([[sentence.size] for sentence in input_data], np.uint32)
    output_len = np.ones_like(input_len).astype(np.uint32) * fixed_output_len

    with httpclient.InferenceServerClient(
        "localhost:8000", concurrency=1, verbose=True
    ) as client:
        inputs = [
            httpclient.InferInput(
                "INPUT_ID", input_data.shape, np_to_triton_dtype(input_data.dtype)
            ),
            httpclient.InferInput(
                "REQUEST_INPUT_LEN",
                input_len.shape,
                np_to_triton_dtype(input_len.dtype),
            ),
            httpclient.InferInput(
                "REQUEST_OUTPUT_LEN",
                output_len.shape,
                np_to_triton_dtype(output_len.dtype),
            ),
        ]
        inputs[0].set_data_from_numpy(input_data)
        inputs[1].set_data_from_numpy(input_len)
        inputs[2].set_data_from_numpy(output_len)
        # requests.append(client.async_infer(model_name, inputs))
        print("send request")
        result = client.infer(model_name, inputs)
        return result.as_numpy("OUTPUT0")


def gpt_j():
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    batch_size = 10
    start = time.time()

    prompt = 'The Belgian national football team '
    tokens = tokenizer([prompt] * batch_size, return_tensors="np").input_ids.astype(np.uint32)
    tokens = tokens.reshape((batch_size, 1, -1))

    FIXED_OUTPUT_LEN = 31
    last_tokens = inference(tokens, FIXED_OUTPUT_LEN)
    last_tokens = last_tokens.reshape(batch_size, -1)

    generated_text = tokenizer.batch_decode(last_tokens)
    print(time.time() - start)
    print(*generated_text, sep='\n')


def main():
    gpt_j()


if __name__ == "__main__":
    main()
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "gpt-j-6b-2gpu"
max_batch_size: 128
input [
{
name: "INPUT_ID"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "REQUEST_INPUT_LEN"
data_type: TYPE_UINT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_UINT32
dims: [ 1 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
}
]
instance_group [
{
count: 1
kind : KIND_CPU
}
]
parameters {
key: "top_k"
value: {
string_value: "50"
}
}
parameters {
key: "top_p"
value: {
string_value: "1.0"
}
}
parameters {
key: "tensor_para_size"
value: {
string_value: "2"
}
}
parameters {
key: "pipeline_para_size"
value: {
string_value: "1"
}
}
parameters {
key: "max_input_len"
value: {
string_value: "512"
}
}
parameters {
key: "max_seq_len"
value: {
string_value: "528"
}
}
parameters {
key: "is_half"
value: {
string_value: "1"
}
}
parameters {
key: "head_num"
value: {
string_value: "16"
}
}
parameters {
key: "size_per_head"
value: {
string_value: "256"
}
}
parameters {
key: "inter_size"
value: {
string_value: "16384"
}
}
parameters {
key: "rotary_embedding"
value: {
string_value: "64"
}
}
parameters {
key: "vocab_size"
value: {
string_value: "50400"
}
}
parameters {
key: "start_id"
value: {
string_value: "50256"
}
}
parameters {
key: "end_id"
value: {
string_value: "50256"
}
}
parameters {
key: "decoder_layers"
value: {
string_value: "28"
}
}
parameters {
key: "model_name"
value: {
string_value: "gpt-j-6b-2gpu"
}
}
parameters {
key: "beam_width"
value: {
string_value: "1"
}
}
parameters {
key: "temperature"
value: {
string_value: "0.9"
}
}
parameters {
key: "repetition_penalty"
value: {
string_value: "1.0"
}
}
parameters {
key: "len_penalty"
value: {
string_value: "1.0"
}
}
parameters {
key: "beam_search_diversity_rate"
value: {
string_value: "0.0"
}
}
dynamic_batching {
preferred_batch_size: [4, 8]
max_queue_delay_microseconds: 200000
}
parameters {
key: "model_type"
value: {
string_value: "GPT-J"
}
}
I'm using the Python script above.
I changed it to send 10 identical inputs as one batch of size 10 and produce 10 outputs.
I would like to get 10 different results, with top-k sampling applied to each input independently.
But I get 10 identical outputs, as in the first image below (top-k sampling is not applied per input); I want 10 different results, like in the second image.
How can I get 10 different outputs (without using dynamic batching)?
Second question: why hasn't dev/v5.0_beta been officially released? I really want to use it, but I'm curious what kind of defects remain.
- Currently, when you duplicate the same input into one batch, you will get the same results because all sentences use the same random seed. We will add a feature to set a different random seed for each sentence of a batch in the future.
- We only publish the beta version because we are considering adjusting the API. There is no correctness issue in the beta release.
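To illustrate the point above with a toy sketch (plain NumPy, not FasterTransformer code): top-k sampling is deterministic given a seed, so identical prompts that share one seed produce identical outputs, while per-row seeds let them diverge.
# Toy illustration only (NumPy, not FT internals): top-k sampling with the same
# seed on the same logits always picks the same token, so duplicated prompts in
# one batch cannot diverge unless each row gets its own seed.
import numpy as np

def sample_top_k(logits: np.ndarray, k: int, seed: int) -> int:
    rng = np.random.default_rng(seed)
    top_idx = np.argsort(logits)[-k:]                  # indices of the k largest logits
    probs = np.exp(logits[top_idx] - logits[top_idx].max())
    probs = probs / probs.sum()                        # softmax over the top-k logits
    return int(rng.choice(top_idx, p=probs))

logits = np.random.default_rng(0).normal(size=50400)          # fake vocab-sized logits
print([sample_top_k(logits, 50, seed=7) for _ in range(3)])   # same seed -> same token
print([sample_top_k(logits, 50, seed=s) for s in (1, 2, 3)])  # per-row seeds -> can differ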
@byshiue
Is there a place where you set the seed? Could you tell me which C++ file it is in?
I'm trying to modify it.
The seed is set in the ParallelGpt.cc constructor.
The latest update in the main branch has moved top_k to the runtime inputs. Note that when you set different top_k values within one batch, inference becomes slower.
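For anyone following along, a rough client-side sketch of that is below. The input names ("runtime_top_k", "random_seed") and their dtypes are assumptions based on the fastertransformer_backend model configuration at the time of writing; check the config.pbtxt shipped with the version you deploy.
# Hedged sketch: send per-sentence top_k and random_seed as extra request inputs
# so identical prompts in one batch can diverge. Input names and dtypes are
# assumptions; verify them against your backend's config.pbtxt.
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

def add_sampling_inputs(inputs, batch_size, top_k=50):
    top_k_arr = np.full((batch_size, 1), top_k, dtype=np.uint32)           # one top_k per row
    seeds = np.arange(1, batch_size + 1, dtype=np.uint64).reshape(-1, 1)   # distinct seed per row
    for name, arr in (("runtime_top_k", top_k_arr), ("random_seed", seeds)):
        t = httpclient.InferInput(name, arr.shape, np_to_triton_dtype(arr.dtype))
        t.set_data_from_numpy(arr)
        inputs.append(t)

# usage: call add_sampling_inputs(inputs, batch_size) before client.infer(model_name, inputs)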
Hi @byshiue, I found in the GPT guide that model inputs like top_k or random_seed support a tensor type:
    xiii. Random_seed [1] or [batch_size, 1] on cpu, optional
However, when I tried the GPT Python example and replaced random_seed with an IntTensor, it gave me the following error:
[INFO] batch size: 4
[WARNING] gemm_config.in is not found; using default GEMM algo
[WARNING] gemm_config.in is not found; using default GEMM algo
Traceback (most recent call last):
File "../examples/pytorch/gpt/gpt_example.py", line 245, in <module>
main()
File "../examples/pytorch/gpt/gpt_example.py", line 167, in main
tokens_batch = gpt(start_ids,
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 744, in _call_impl
result = self.forward(*input, **kwargs)
File "/workspace/FasterTransformer/examples/pytorch/gpt/../../../examples/pytorch/gpt/utils/gpt.py", line 299, in forward
outputs = self.model.forward(start_ids,
RuntimeError: forward() Expected a value of type 'int' for argument '_11' but instead found type 'Tensor'.
Position: 11
Value: tensor([11956, 60550, 46756, 76151], dtype=torch.int32)
Declaration: forward(__torch__.torch.classes.FasterTransformer.GptOp _0, Tensor _1, Tensor _2, int _3, int _4, int _5, float _6, float _7, float _8, float _9, float _10, int _11, int _12) -> (Tensor[] _0)
Cast error details: Unable to cast Python instance to C++ type (compile in debug mode for details)
It looks like the random_seed argument only accepts an int.
Is there something I have misunderstood about the guide?
For the PyTorch op, only a scalar is supported for now.
In the latest release, the FT GPT PyTorch op supports vectored top_k and random seed.
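A rough sketch of what the vectored call might look like with such a release is below; the keyword names and dtypes are assumptions based on examples/pytorch/gpt/utils/gpt.py and may differ between FT versions, and the model objects are taken from gpt_example.py rather than constructed here.
# Hedged sketch (PyTorch side): pass per-sentence top_k and random_seed tensors
# instead of scalars. Keyword names and dtypes are assumptions; adjust to the
# gpt.py wrapper of the FT release you build.
import torch

batch_size = 4
top_k = torch.full((batch_size,), 50, dtype=torch.int32)                 # one top_k per sentence
random_seed = torch.randint(0, 2**31, (batch_size,), dtype=torch.int64)  # one seed per sentence

tokens_batch = gpt(start_ids,       # gpt, start_ids, start_lengths, output_len as in gpt_example.py
                   start_lengths,
                   output_len,
                   top_k=top_k,
                   random_seed=random_seed)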
Closing this issue because it is inactive. Feel free to re-open it if you still have any problem.