fastertransformer_backend
fastertransformer_backend copied to clipboard
Ragged Batching on Megatron Fast Transformer Backend
I followed the tutorial to deploy NeMo Megatron on Triton, and it was working well. I then wanted to add ragged batching, so I added allow_ragged_batch: true to the config file, resulting in the entry for input_ids shown below. This caused the model to crash. Is there something else I need to do to enable ragged batching? If this can't work, is there a recommended approach for achieving something similar to ragged batching?
input {
name: "input_ids"
data_type: TYPE_UINT32
dims: -1
allow_ragged_batch: true
}
Here's the error:
terminate called after throwing an instance of 'std::runtime_error'
what(): [FT][ERROR] CUDA runtime error: an illegal memory access was encountered /opt/fastertransformer_backend/build/_deps/repo-ft-src/src/fastertransformer/utils/memory_utils.cu:96
Signal (6) received.
0# 0x000056459FF0AC19 in tritonserver
1# 0x00007F7F6B298090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# gsignal in /usr/lib/x86_64-linux-gnu/libc.so.6
3# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
4# 0x00007F7F6B651911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007F7F6B65D38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007F7F6B65D3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007F7F6B65D6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8# 0x00007F7EFD57E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
9# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
13# 0x00007F7F6076D08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
14# 0x00007F7F6B689DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
15# 0x00007F7F6C89E609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
16# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Signal (11) received.
0# 0x000056459FF0AC19 in tritonserver
1# 0x00007F7F6B298090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
3# 0x00007F7F6B651911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
4# 0x00007F7F6B65D38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007F7F6B65D3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007F7F6B65D6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007F7EFD57E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
8# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
9# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# 0x00007F7F6076D08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
13# 0x00007F7F6B689DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
14# 0x00007F7F6C89E609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
15# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Have you added the batch_input parameters shown below? Alternatively, you can share your config.pbtxt. You can also take a look at the config example.
batch_input [
{
kind: BATCH_ITEM_SHAPE
target_name: "input_ids_item_shape"
data_type: TYPE_INT32
source_input: "input_ids"
}
]
I'll double-check, thanks.
@PerkzZheng, after adding that, I am getting a different error:
terminate called after throwing an instance of 'std::runtime_error'
what(): [FT][ERROR] CUDA runtime error: an illegal memory access was encountered /opt/fastertransformer_backend/build/_deps/repo-ft-src/src/fastertransformer/utils/memory_utils.cu:96
Signal (6) received.
0# 0x0000558D09030C19 in tritonserver
1# 0x00007FA6B2275090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# gsignal in /usr/lib/x86_64-linux-gnu/libc.so.6
3# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
4# 0x00007FA6B262E911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007FA6B263A38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007FA6B263A3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007FA6B263A6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
8# 0x00007FA63557E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
9# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
13# 0x00007FA6A009F08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
14# 0x00007FA6B2666DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
15# 0x00007FA6B387B609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
16# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Signal (11) received.
0# 0x0000558D09030C19 in tritonserver
1# 0x00007FA6B2275090 in /usr/lib/x86_64-linux-gnu/libc.so.6
2# abort in /usr/lib/x86_64-linux-gnu/libc.so.6
3# 0x00007FA6B262E911 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
4# 0x00007FA6B263A38C in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
5# 0x00007FA6B263A3F7 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
6# 0x00007FA6B263A6A9 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
7# 0x00007FA63557E065 in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
8# fastertransformer::invokeLengthCriterion(bool*, bool*, int*, unsigned int const*, int, int, int, CUstream_st*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
9# fastertransformer::DynamicDecodeLayer<float>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
10# fastertransformer::ParallelGpt<__nv_bfloat16>::forward(std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > >*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, fastertransformer::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, fastertransformer::Tensor> > > const*, fastertransformer::ParallelGptWeight<__nv_bfloat16> const*) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
11# ParallelGptTritonModelInstance<__nv_bfloat16>::forward(std::shared_ptr<std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, triton::Tensor, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, triton::Tensor> > > >) in /opt/tritonserver/backends/fastertransformer/libtransformer-shared.so
12# 0x00007FA6A009F08A in /opt/tritonserver/backends/fastertransformer/libtriton_fastertransformer.so
13# 0x00007FA6B2666DE4 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6
14# 0x00007FA6B387B609 in /usr/lib/x86_64-linux-gnu/libpthread.so.0
15# clone in /usr/lib/x86_64-linux-gnu/libc.so.6
Here is the full config:
name: "gpt3_1.3b"
max_batch_size: 256
batch_input [
{
kind: BATCH_ITEM_SHAPE
target_name: "input_ids_item_shape"
data_type: TYPE_INT32
source_input: "input_ids"
}
]
input {
name: "input_ids"
data_type: TYPE_UINT32
dims: -1
allow_ragged_batch: true
}
input {
name: "input_lengths"
data_type: TYPE_UINT32
dims: 1
reshape {
}
}
input {
name: "request_output_len"
data_type: TYPE_UINT32
dims: -1
}
input {
name: "runtime_top_k"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "runtime_top_p"
data_type: TYPE_FP32
dims: 1
reshape {
}
optional: true
}
input {
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: 1
reshape {
}
optional: true
}
input {
name: "temperature"
data_type: TYPE_FP32
dims: 1
reshape {
}
optional: true
}
input {
name: "len_penalty"
data_type: TYPE_FP32
dims: 1
reshape {
}
optional: true
}
input {
name: "repetition_penalty"
data_type: TYPE_FP32
dims: 1
reshape {
}
optional: true
}
input {
name: "random_seed"
data_type: TYPE_UINT64
dims: 1
reshape {
}
optional: true
}
input {
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: 1
reshape {
}
optional: true
}
input {
name: "beam_width"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "start_id"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "end_id"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "stop_words_list"
data_type: TYPE_INT32
dims: 2
dims: -1
optional: true
}
input {
name: "bad_words_list"
data_type: TYPE_INT32
dims: 2
dims: -1
optional: true
}
input {
name: "prompt_learning_task_name_ids"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "request_prompt_embedding"
data_type: TYPE_FP16
dims: -1
dims: -1
optional: true
}
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "stop_words_list"
data_type: TYPE_INT32
dims: 2
dims: -1
optional: true
}
input {
name: "bad_words_list"
data_type: TYPE_INT32
dims: 2
dims: -1
optional: true
}
input {
name: "prompt_learning_task_name_ids"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "request_prompt_embedding"
data_type: TYPE_FP16
dims: -1
dims: -1
optional: true
} input {
name: "request_prompt_lengths"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
input {
name: "request_prompt_type"
data_type: TYPE_UINT32
dims: 1
reshape {
}
optional: true
}
output {
name: "output_ids"
data_type: TYPE_UINT32
dims: -1
dims: -1
}
output {
name: "sequence_length"
data_type: TYPE_UINT32
dims: -1
}
output {
name: "cum_log_probs"
data_type: TYPE_FP32
dims: -1
}
output {
name: "output_log_probs"
data_type: TYPE_FP32
dims: -1
dims: -1
}
instance_group {
count: 1
kind: KIND_CPU
}
default_model_filename: "1-gpu"
parameters {
key: "data_type"
value {
string_value: "bf16"
}
}
parameters {
key: "enable_custom_all_reduce"
value {
string_value: "0"
}
}
parameters {
key: "int8_mode"
value {
string_value: "0"
}
}
parameters {
key: "model_checkpoint_path"
value {
string_value: "/model_repository/gpt3_1.3b/1-gpu"
}
}
parameters {
key: "model_type"
value {
string_value: "GPT"
}
}
parameters {
key: "pipeline_para_size"
value {
string_value: "1"
}
}
parameters {
key: "tensor_para_size"
value {
string_value: "1"
}
}
backend: "fastertransformer"
model_transaction_policy {
}
dynamic_batching {
max_queue_delay_microseconds: 50000
}
It looks like you are not using the latest FT. Try the latest main branch (v5.3), and set FT_DEBUG_LEVEL=DEBUG when running again.