FasterTransformer
How to apply the top_k option to each input?
Hello, first of all, thank you for creating this library. I have two questions.
First question: I followed this guide and successfully started the Triton server.
Here is my request Python script, followed by my config.pbtxt:
#!/usr/bin/python
import time
import argparse
import numpy as np
import os
import re
import sys
import requests as httpreq
from builtins import range
import statistics as s
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype
from transformers import AutoTokenizer
from transformers import PreTrainedTokenizerFast
def inference(input_data: np.ndarray, fixed_output_len: int) -> np.ndarray:
    """
    input_data: (batch_size, 1, sentence_len)
    """
    model_name = "fastertransformer"

    # shape
    input_len = np.array([[sentence.size] for sentence in input_data], np.uint32)
    output_len = np.ones_like(input_len).astype(np.uint32) * fixed_output_len

    with httpclient.InferenceServerClient(
        "localhost:8000", concurrency=1, verbose=True
    ) as client:
        inputs = [
            httpclient.InferInput(
                "INPUT_ID", input_data.shape, np_to_triton_dtype(input_data.dtype)
            ),
            httpclient.InferInput(
                "REQUEST_INPUT_LEN",
                input_len.shape,
                np_to_triton_dtype(input_len.dtype),
            ),
            httpclient.InferInput(
                "REQUEST_OUTPUT_LEN",
                output_len.shape,
                np_to_triton_dtype(output_len.dtype),
            ),
        ]
        inputs[0].set_data_from_numpy(input_data)
        inputs[1].set_data_from_numpy(input_len)
        inputs[2].set_data_from_numpy(output_len)
        # requests.append(client.async_infer(model_name, inputs))
        print("send request")
        result = client.infer(model_name, inputs)
        return result.as_numpy("OUTPUT0")


def gpt_j():
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    batch_size = 10
    start = time.time()

    prompt = 'The Belgian national football team '
    tokens = tokenizer([prompt] * batch_size, return_tensors="np").input_ids.astype(np.uint32)
    tokens = tokens.reshape((batch_size, 1, -1))

    FIXED_OUTPUT_LEN = 31
    last_tokens = inference(tokens, FIXED_OUTPUT_LEN)
    last_tokens = last_tokens.reshape(batch_size, -1)

    generated_text = tokenizer.batch_decode(last_tokens)
    print(time.time() - start)
    print(*generated_text, sep='\n')


def main():
    gpt_j()


if __name__ == "__main__":
    main()
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "gpt-j-6b-2gpu"
max_batch_size: 128
input [
{
name: "INPUT_ID"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "REQUEST_INPUT_LEN"
data_type: TYPE_UINT32
dims: [ 1 ]
},
{
name: "REQUEST_OUTPUT_LEN"
data_type: TYPE_UINT32
dims: [ 1 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
}
]
instance_group [
{
count: 1
kind : KIND_CPU
}
]
parameters {
key: "top_k"
value: {
string_value: "50"
}
}
parameters {
key: "top_p"
value: {
string_value: "1.0"
}
}
parameters {
key: "tensor_para_size"
value: {
string_value: "2"
}
}
parameters {
key: "pipeline_para_size"
value: {
string_value: "1"
}
}
parameters {
key: "max_input_len"
value: {
string_value: "512"
}
}
parameters {
key: "max_seq_len"
value: {
string_value: "528"
}
}
parameters {
key: "is_half"
value: {
string_value: "1"
}
}
parameters {
key: "head_num"
value: {
string_value: "16"
}
}
parameters {
key: "size_per_head"
value: {
string_value: "256"
}
}
parameters {
key: "inter_size"
value: {
string_value: "16384"
}
}
parameters {
key: "rotary_embedding"
value: {
string_value: "64"
}
}
parameters {
key: "vocab_size"
value: {
string_value: "50400"
}
}
parameters {
key: "start_id"
value: {
string_value: "50256"
}
}
parameters {
key: "end_id"
value: {
string_value: "50256"
}
}
parameters {
key: "decoder_layers"
value: {
string_value: "28"
}
}
parameters {
key: "model_name"
value: {
string_value: "gpt-j-6b-2gpu"
}
}
parameters {
key: "beam_width"
value: {
string_value: "1"
}
}
parameters {
key: "temperature"
value: {
string_value: "0.9"
}
}
parameters {
key: "repetition_penalty"
value: {
string_value: "1.0"
}
}
parameters {
key: "len_penalty"
value: {
string_value: "1.0"
}
}
parameters {
key: "beam_search_diversity_rate"
value: {
string_value: "0.0"
}
}
dynamic_batching {
preferred_batch_size: [4, 8]
max_queue_delay_microseconds: 200000
}
parameters {
key: "model_type"
value: {
string_value: "GPT-J"
}
}
I'm using the Python script above.
I changed it to send 10 identical inputs as one batch of size 10 and produce 10 outputs.
I would like to get 10 different results, with top-k sampling applied to each input independently.
But I get 10 identical outputs, as in the first image below (top-k sampling is not applied per input); I want 10 different results, like in the second image.
How can I get 10 different outputs (without using dynamic batching)?
Second question: why hasn't dev/v5.0_beta been officially released? I really want to use it, but I'm curious what kind of defects remain.
- Currently, when you duplicate the same input into one batch, you will get the same results because all sentences use the same random seed. We will add a feature to set a different random seed for each sentence of a batch in the future.
- We only publish the beta version because we are considering adjusting the API. There is no correctness issue in the beta release.
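To illustrate the point above with a toy sketch (plain NumPy, not FasterTransformer code): top-k sampling is deterministic given a seed, so identical prompts that share one seed produce identical outputs, while per-row seeds let them diverge.
# Toy illustration only (NumPy, not FT internals): top-k sampling with the same
# seed on the same logits always picks the same token, so duplicated prompts in
# one batch cannot diverge unless each row gets its own seed.
import numpy as np

def sample_top_k(logits: np.ndarray, k: int, seed: int) -> int:
    rng = np.random.default_rng(seed)
    top_idx = np.argsort(logits)[-k:]                  # indices of the k largest logits
    probs = np.exp(logits[top_idx] - logits[top_idx].max())
    probs = probs / probs.sum()                        # softmax over the top-k logits
    return int(rng.choice(top_idx, p=probs))

logits = np.random.default_rng(0).normal(size=50400)          # fake vocab-sized logits
print([sample_top_k(logits, 50, seed=7) for _ in range(3)])   # same seed -> same token
print([sample_top_k(logits, 50, seed=s) for s in (1, 2, 3)])  # per-row seeds -> can differ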
@byshiue
Is there a place where you set the seed? Could you tell me which C++ file it is in?
I'm trying to modify it.
The seed is set in the ParallelGpt.cc constructor.
The latest update in the main branch has moved top_k to the runtime inputs. Note that when you set different top_k values within one batch, inference becomes slower.
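For anyone following along, a rough client-side sketch of that is below. The input names ("runtime_top_k", "random_seed") and their dtypes are assumptions based on the fastertransformer_backend model configuration at the time of writing; check the config.pbtxt shipped with the version you deploy.
# Hedged sketch: send per-sentence top_k and random_seed as extra request inputs
# so identical prompts in one batch can diverge. Input names and dtypes are
# assumptions; verify them against your backend's config.pbtxt.
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

def add_sampling_inputs(inputs, batch_size, top_k=50):
    top_k_arr = np.full((batch_size, 1), top_k, dtype=np.uint32)           # one top_k per row
    seeds = np.arange(1, batch_size + 1, dtype=np.uint64).reshape(-1, 1)   # distinct seed per row
    for name, arr in (("runtime_top_k", top_k_arr), ("random_seed", seeds)):
        t = httpclient.InferInput(name, arr.shape, np_to_triton_dtype(arr.dtype))
        t.set_data_from_numpy(arr)
        inputs.append(t)

# usage: call add_sampling_inputs(inputs, batch_size) before client.infer(model_name, inputs)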
Hi @byshiue, I found in the GPT guide that model inputs like top_k or random_seed support a tensor type:
    xiii. Random_seed [1] or [batch_size, 1] on cpu, optional
However, when I tried the GPT Python example and replaced random_seed with an IntTensor, it gave me the following error:
[INFO] batch size: 4
[WARNING] gemm_config.in is not found; using default GEMM algo
[WARNING] gemm_config.in is not found; using default GEMM algo
Traceback (most recent call last):
File "../examples/pytorch/gpt/gpt_example.py", line 245, in <module>
main()
File "../examples/pytorch/gpt/gpt_example.py", line 167, in main
tokens_batch = gpt(start_ids,
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 744, in _call_impl
result = self.forward(*input, **kwargs)
File "/workspace/FasterTransformer/examples/pytorch/gpt/../../../examples/pytorch/gpt/utils/gpt.py", line 299, in forward
outputs = self.model.forward(start_ids,
RuntimeError: forward() Expected a value of type 'int' for argument '_11' but instead found type 'Tensor'.
Position: 11
Value: tensor([11956, 60550, 46756, 76151], dtype=torch.int32)
Declaration: forward(__torch__.torch.classes.FasterTransformer.GptOp _0, Tensor _1, Tensor _2, int _3, int _4, int _5, float _6, float _7, float _8, float _9, float _10, int _11, int _12) -> (Tensor[] _0)
Cast error details: Unable to cast Python instance to C++ type (compile in debug mode for details)
It looks like the random_seed argument only accepts an int.
Is there something I have misunderstood about the guide?
For the PyTorch op, only a scalar is supported for now.
In the latest release, the FT GPT PyTorch op supports vectored top_k and random seed.
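A rough sketch of what the vectored call might look like with such a release is below; the keyword names and dtypes are assumptions based on examples/pytorch/gpt/utils/gpt.py and may differ between FT versions, and the model objects are taken from gpt_example.py rather than constructed here.
# Hedged sketch (PyTorch side): pass per-sentence top_k and random_seed tensors
# instead of scalars. Keyword names and dtypes are assumptions; adjust to the
# gpt.py wrapper of the FT release you build.
import torch

batch_size = 4
top_k = torch.full((batch_size,), 50, dtype=torch.int32)                 # one top_k per sentence
random_seed = torch.randint(0, 2**31, (batch_size,), dtype=torch.int64)  # one seed per sentence

tokens_batch = gpt(start_ids,       # gpt, start_ids, start_lengths, output_len as in gpt_example.py
                   start_lengths,
                   output_len,
                   top_k=top_k,
                   random_seed=random_seed)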
Closing this issue because it is inactive. Feel free to re-open it if you still have any problem.