fastertransformer_backend
Poll failed for model directory 'ensemble': output 'OUTPUT_0' for ensemble 'ensemble' is not written
Hi, when I serve a fastertransformer_backend GPT model through an ensemble, the server fails to load the ensemble model at startup with the error below. Could you please give some advice? Thanks.
CUDA_VISIBLE_DEVICES="0,1" /opt/tritonserver/bin/tritonserver --model-store=fastertransformer_backend/all_models/nemo-megatron-gpt-5B > log 2>&1 &
...
E0613 07:51:38.390571 142822 model_repository_manager.cc:1002] Poll failed for model directory 'ensemble': output 'OUTPUT_0' for ensemble 'ensemble' is not written
...
I0613 07:51:48.643366 142822 server.cc:264] Waiting for in-flight requests to complete.
I0613 07:51:48.643407 142822 server.cc:280] Timeout 30: Found 0 model versions that have in-flight inferences
I0613 07:51:48.643479 142822 server.cc:295] All models are stopped, unloading models
I0613 07:51:48.643495 142822 server.cc:302] Timeout 30: Found 1 live models and 0 in-flight non-inference requests
I0613 07:51:48.643572 142822 libfastertransformer.cc:1965] TRITONBACKEND_ModelInstanceFinalize: delete instance state
I0613 07:51:48.643651 142822 libfastertransformer.cc:1899] TRITONBACKEND_ModelFinalize: delete model state
I0613 07:51:48.643688 142822 libfastertransformer.cc:1904] TRITONBACKEND_ModelFinalize: MPI Finalize
I0613 07:51:48.730007 142822 model_lifecycle.cc:579] successfully unloaded 'fastertransformer' version 1
I0613 07:51:49.643671 142822 server.cc:302] Timeout 29: Found 0 live models and 0 in-flight non-inference requests
error: creating server: Internal - failed to load all models
To reproduce:
model weights: https://huggingface.co/nvidia/nemo-megatron-gpt-5B
ensemble example: https://github.com/triton-inference-server/fastertransformer_backend/tree/main/all_models/gpt
In my case, the ensemble contains only the fastertransformer model. My model repository fastertransformer_backend/all_models/nemo-megatron-gpt-5B includes the ensemble and fastertransformer folders, laid out as shown below.
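Roughly (the 1/ version directories are how I created them; the one under ensemble is empty, since an ensemble carries no model files of its own):

fastertransformer_backend/all_models/nemo-megatron-gpt-5B/
├── ensemble/
│   ├── 1/
│   └── config.pbtxt
└── fastertransformer/
    ├── 1/
    └── config.pbtxt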
# fastertransformer_backend/all_models/nemo-megatron-gpt-5B/ensemble/config.pbtxt
name: "ensemble"
platform: "ensemble"
max_batch_size: 1024
input [
{
name: "INPUT_0"
data_type: TYPE_UINT32
dims: [ -1 ]
allow_ragged_batch: true
},
{
name: "INPUT_1"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "INPUT_2"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "INPUT_3"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "runtime_top_k"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
},
{
name: "runtime_top_p"
data_type: TYPE_FP32
dims: [ 1 ]
optional: true
},
{
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
optional: true
},
{
name: "temperature"
data_type: TYPE_FP32
dims: [ 1 ]
optional: true
},
{
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
optional: true
},
{
name: "repetition_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
optional: true
},
{
name: "random_seed"
data_type: TYPE_UINT64
dims: [ 1 ]
optional: true
},
{
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: [ 1 ]
optional: true
},
{
name: "beam_width"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
},
{
name: "start_id"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
},
{
name: "end_id"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
},
{
name: "prompt_learning_task_name_ids"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
},
{
name: "request_prompt_embedding"
data_type: TYPE_FP16
dims: [ -1, -1 ]
optional: true
},
{
name: "request_prompt_lengths"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
},
{
name: "request_prompt_type"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
},
{
name: "top_p_decay"
data_type: TYPE_FP32
dims: [ 1 ]
optional: true
},
{
name: "top_p_min"
data_type: TYPE_FP32
dims: [ 1 ]
optional: true
},
{
name: "top_p_reset_ids"
data_type: TYPE_UINT32
dims: [ 1 ]
optional: true
}
]
output [
{
name: "OUTPUT_0"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "response_input_lengths"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "cum_log_probs"
data_type: TYPE_FP32
dims: [ -1 ]
},
{
name: "output_log_probs"
data_type: TYPE_FP32
dims: [ -1, -1 ]
}
]
ensemble_scheduling {
step [
{
model_name: "fastertransformer"
model_version: -1
input_map {
key: "input_ids"
value: "INPUT_0"
}
input_map {
key: "input_lengths"
value: "input_lengths"
}
input_map {
key: "request_output_len"
value: "INPUT_1"
}
input_map {
key: "prompt_learning_task_name_ids"
value: "prompt_learning_task_name_ids"
}
input_map {
key: "request_prompt_embedding"
value: "request_prompt_embedding"
}
input_map {
key: "request_prompt_lengths"
value: "request_prompt_lengths"
}
input_map {
key: "request_prompt_type"
value: "request_prompt_type"
}
input_map {
key: "runtime_top_k"
value: "runtime_top_k"
}
input_map {
key: "runtime_top_p"
value: "runtime_top_p"
}
input_map {
key: "beam_search_diversity_rate"
value: "beam_search_diversity_rate"
}
input_map {
key: "temperature"
value: "temperature"
}
input_map {
key: "len_penalty"
value: "len_penalty"
}
input_map {
key: "repetition_penalty"
value: "repetition_penalty"
}
input_map {
key: "random_seed"
value: "random_seed"
}
input_map {
key: "is_return_log_probs"
value: "is_return_log_probs"
}
input_map {
key: "beam_width"
value: "beam_width"
}
input_map {
key: "start_id"
value: "start_id"
}
input_map {
key: "end_id"
value: "end_id"
}
input_map {
key: "stop_words_list"
value: "INPUT_2"
}
input_map {
key: "bad_words_list"
value: "INPUT_3"
}
input_map {
key: "top_p_decay"
value: "top_p_decay"
}
input_map {
key: "top_p_min"
value: "top_p_min"
}
input_map {
key: "top_p_reset_ids"
value: "top_p_reset_ids"
}
output_map {
key: "output_ids"
value: "OUTPUT_0"
}
output_map {
key: "sequence_length"
value: "sequence_length"
}
output_map {
key: "response_input_lengths"
value: "response_input_lengths"
}
output_map {
key: "cum_log_probs"
value: "cum_log_probs"
}
output_map {
key: "output_log_probs"
value: "output_log_probs"
}
}
]
}
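While debugging, I also wrote a small sanity check of my own (just a sketch, not Triton's actual validation; it parses the config with the ModelConfig proto that ships in the tritonclient Python package) to test the rule the error message seems to refer to: every ensemble output must be written by some step, and every step input must come from a declared ensemble input or an earlier step's output.

# My own simplified re-implementation of the consistency rule -- NOT Triton's
# code. It does not prune unreachable steps, so it only reports first-order
# problems.
from google.protobuf import text_format
from tritonclient.grpc import model_config_pb2

def check_ensemble(config_path):
    config = model_config_pb2.ModelConfig()
    with open(config_path) as f:
        text_format.Parse(f.read(), config)

    # Tensors a step may read: ensemble inputs plus outputs of earlier steps.
    available = {inp.name for inp in config.input}
    written = set()
    for step in config.ensemble_scheduling.step:
        for model_input, tensor in step.input_map.items():
            if tensor not in available:
                print(f"step '{step.model_name}': input '{model_input}' reads "
                      f"'{tensor}', which nothing produces")
        for _model_output, tensor in step.output_map.items():
            written.add(tensor)
            available.add(tensor)

    for out in config.output:
        if out.name not in written:
            print(f"ensemble output '{out.name}' is not written by any step")

check_ensemble(
    "fastertransformer_backend/all_models/nemo-megatron-gpt-5B/ensemble/config.pbtxt")

Run against the ensemble config above, it flags the input_lengths mapping (the value "input_lengths" is not one of the declared ensemble inputs), but I am not sure whether that is what triggers the OUTPUT_0 error.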
# fastertransformer_backend/all_models/nemo-megatron-gpt-5B/fastertransformer/config.pbtxt
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "gpt3_345M"
max_batch_size: 1024
model_transaction_policy {
decoupled: False
}
dynamic_batching {
max_queue_delay_microseconds: 50000
}
batch_input [
{
kind: BATCH_ITEM_SHAPE
target_name: "input_ids_item_shape"
data_type: TYPE_INT32
source_input: "input_ids"
}
]
input [
{
name: "input_ids"
data_type: TYPE_UINT32
dims: [ -1 ]
allow_ragged_batch: true
},
{
name: "input_lengths"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
},
{
name: "request_output_len"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "runtime_top_k"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "temperature"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "repetition_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "random_seed"
data_type: TYPE_UINT64
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "is_return_context_embeddings"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "beam_width"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "start_id"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "end_id"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "stop_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
name: "bad_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
name: "prompt_learning_task_name_ids"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "request_prompt_embedding"
data_type: TYPE_FP16
dims: [ -1, -1 ]
optional: true
allow_ragged_batch: false
},
{
name: "request_prompt_lengths"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "request_prompt_type"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "top_p_decay"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "top_p_min"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "top_p_reset_ids"
data_type: TYPE_UINT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
}
]
output [
{
name: "output_ids"
data_type: TYPE_UINT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_UINT32
dims: [ -1 ]
},
{
name: "response_input_lengths"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "cum_log_probs"
data_type: TYPE_FP32
dims: [ -1 ]
},
{
name: "output_log_probs"
data_type: TYPE_FP32
dims: [ -1, -1 ]
},
{
name: "context_embeddings"
data_type: TYPE_FP32
dims: [ -1, -1 ]
}
]
instance_group [
{
count: 1
kind: KIND_CPU
}
]
parameters {
key: "tensor_para_size"
value: {
string_value: "2"
}
}
parameters {
key: "pipeline_para_size"
value: {
string_value: "1"
}
}
parameters {
key: "data_type"
value: {
string_value: "fp16"
}
}
parameters {
key: "model_type"
value: {
string_value: "GPT"
}
}
parameters {
key: "model_checkpoint_path"
value: {
string_value: "models/nemo-megatron-gpt-5B/2-gpu/"
}
}
parameters {
key: "int8_mode"
value: {
string_value: "0"
}
}
parameters {
key: "enable_custom_all_reduce"
value: {
string_value: "0"
}
}
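For completeness, this is how I intend to query the ensemble once it loads (a sketch only; the token IDs are placeholders, and just the four required ensemble inputs plus two outputs are wired up):

import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

client = httpclient.InferenceServerClient("localhost:8000")

# Placeholder request: one sequence of token IDs, 32 output tokens requested,
# and minimal stop/bad word lists. Shapes include the batch dimension.
tensors = {
    "INPUT_0": np.array([[9915, 27221, 59, 77, 383]], dtype=np.uint32),  # input_ids
    "INPUT_1": np.array([[32]], dtype=np.uint32),  # request_output_len
    "INPUT_2": np.array([[0]], dtype=np.int32),    # stop_words_list
    "INPUT_3": np.array([[0]], dtype=np.int32),    # bad_words_list
}

inputs = []
for name, data in tensors.items():
    t = httpclient.InferInput(name, list(data.shape), np_to_triton_dtype(data.dtype))
    t.set_data_from_numpy(data)
    inputs.append(t)

result = client.infer(
    "ensemble", inputs,
    outputs=[httpclient.InferRequestedOutput("OUTPUT_0"),
             httpclient.InferRequestedOutput("sequence_length")])
print(result.as_numpy("OUTPUT_0"), result.as_numpy("sequence_length"))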