server
server copied to clipboard
tensorrt backend coredump at using cudaGraph with output_copy_stream True
Description TensorRT backend with cudaGraph enabled and output_copy_stream True causes segfault errors.
Below are the backtraces for the core dump.
#0 0x00007f6e301549e9 in triton::backend::tensorrt::TRTv3Interface::SetTensorAddress(nvinfer1::IExecutionContext*)
() from /opt/tritonserver/backends/tensorrt/libtriton_tensorrt.so
#1 0x00007f6e3017f346 in triton::backend::tensorrt::TRTv3Interface::BuildCudaGraph(triton::backend::tensorrt::TensorRTContext*, triton::backend::tensorrt::GraphSpec const&) ()
from /opt/tritonserver/backends/tensorrt/libtriton_tensorrt.so
#2 0x00007f6e30185d65 in triton::backend::tensorrt::ModelInstanceState::InitializeCudaGraph() ()
from /opt/tritonserver/backends/tensorrt/libtriton_tensorrt.so
#3 0x00007f6e3018ab19 in triton::backend::tensorrt::ModelInstanceState::Create(triton::backend::tensorrt::ModelState*, TRITONBACKEND_ModelInstance*, triton::backend::tensorrt::ModelInstanceState**) ()
from /opt/tritonserver/backends/tensorrt/libtriton_tensorrt.so
#4 0x00007f6e30136729 in TRITONBACKEND_ModelInstanceInitialize ()
from /opt/tritonserver/backends/tensorrt/libtriton_tensorrt.so
#5 0x00007f6e444533af in triton::core::TritonModelInstance::CreateInstance(triton::core::TritonModel*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long, TRITONSERVER_instancegroupkind_enum, int, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > const&, inference::ModelRateLimiter const&, bool, std::map<unsigned int, std::shared_ptr<triton::core::TritonModelInstance::TritonBackendThread>, std::less<unsigned int>, std::allocator<std::pair<unsigned int const, std::shared_ptr<triton::core::TritonModelInstance::TritonBackendThread> > > >*, std::vector<triton::core::TritonModelInstance::SecondaryDevice, std::allocator<triton::core::TritonModelInstance::SecondaryDevice> > const&) () from /opt/tritonserver/bin/../lib/libtritonserver.so
#6 0x00007f6e44454ae8 in triton::core::TritonModelInstance::CreateInstances(triton::core::TritonModel*, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > > const&, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > > const&, inference::ModelConfig const&, bool) ()
from /opt/tritonserver/bin/../lib/libtritonserver.so
#7 0x00007f6e444469a5 in triton::core::TritonModel::Create(triton::core::InferenceServer*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::vector<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > > const&, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > >, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > > const&, long, inference::ModelConfig, bool, std::unique_ptr<triton::core::TritonModel, std::default_delete<triton::core::TritonModel> >*) () from /opt/tritonserver/bin/../lib/libtritonserver.so
#8 0x00007f6e44518d13 in triton::core::ModelLifeCycle::CreateModel(triton::core::ModelIdentifier const&, long, triton::core::ModelLifeCycle::ModelInfo*, bool) () from /opt/tritonserver/bin/../lib/libtritonserver.so
#9 0x00007f6e4451e430 in std::_Function_handler<void (), triton::core::ModelLifeCycle::AsyncLoad(triton::core::ModelIdentifier const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, inference::ModelConfig const&, bool, std::shared_ptr<triton::core::TritonRepoAgentModelList> const&, std::function<void (triton::core::Status)>&&)::{lambda()#1}>::_M_invoke(std::_Any_data const&) ()
from /opt/tritonserver/bin/../lib/libtritonserver.so
#10 0x00007f6e44659b10 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<triton::common::ThreadPool::ThreadPool(unsigned long)::{lambda()#1}> > >::_M_run() () from /opt/tritonserver/bin/../lib/libtritonserver.so
#11 0x00007f6e43ea7de4 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#12 0x00007f6e43fbb609 in start_thread (arg=<optimized out>) at pthread_create.c:477
#13 0x00007f6e43b92133 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Triton Information I'm using Google Cloud Platform's Compute Engine as my host, and I run the container on this host. The container got 4 CPU cores, 8GB of memory, and the shared memory (/dev/shm) is 64MB in size.
I use the nvcr.io/nvidia/tritonserver:23.04 image to run the tritonserver container. Below is the config.pbtxt which I used.
name: "inference"
platform: "tensorrt_plan"
backend: "tensorrt"
max_batch_size : 64
input [
{
name: "input_ids"
data_type: TYPE_INT32
dims: [ 140 ]
},
{
name: "attention_mask"
data_type: TYPE_INT32
dims: [ 140 ]
}
]
output [
{
name: "outputs"
data_type: TYPE_FP32
dims: [ 3 ]
}
]
dynamic_batching{
max_queue_delay_microseconds: 100000
}
instance_group[
{
count : 1
kind: KIND_GPU
gpus: [ 0 ]
}
]
optimization{
graph: {
level : 1
},
eager_batching : 1,
cuda: {
graphs: 1,
graph_spec: [
{
batch_size: 64
}
]
output_copy_stream: 1
}
}
How did I make the tensorrt plan file: First, I converted my pytorch weights to onnx with pytorch's export function, using the options below
- opset : 17
- dynamic : True

Additionally, I simplified my onnx model with the onnxsim module.
Then I used nvcr.io/nvidia/tensorrt:23.04's trtexec to make the tensorrt plan from my onnx file. I specified minShapes, optShapes, and maxShapes for the optimization profile and used int8 precision for my plan file. I also activated the useCudaGraph option so that, after optimization ends, tensorrt can evaluate performance with cuda graph. It succeeds in the tensorrt container.
My pytorch model is kind of BERT (https://github.com/huggingface/transformers.git)
To Reproduce ~~I will provide a link to a tensorrt plan file made with empty weights and configuration files that can reproduce this error ASAP.~~ I can't provide the exact same weights I described above, but I found that a simple resnet example shows the same problem in my environment (maybe the container is the problem?). Below is a link to an example to reproduce. (https://drive.google.com/drive/folders/11GnrxKggHd6by39sSpplaJLAb5jJuS-v?usp=sharing)
you can just reproduce error with
tritonserver --model-repository=resnset
I additionally provide below things at the shared link
- log.txt : Output logs of tritonserver (with the --log-verbose and --log-info options)
- model.onnx : This is ONNX file that I used to make tensorrt plan, I get it from (https://github.com/onnx/models/blob/main/vision/classification/resnet/model/resnet18-v1-7.onnx)
- report1.nsys-rep : I use nsys profile to get information about GPU instruction when core dumped.
Expected behavior CudaGraph support without segfault
I meet the same problem when using triton in the docker image nvcr.io/nvidia/tritonserver:23.12-py3. Is there any solution?
We have created a ticket to investigate.
Ref: 6187