Tritonserver segfaulting when sending multiple async requests
Description
I'm getting the following error when sending multiple async requests to my Python backend.
On the client side I get this error:
[StatusCode.UNAVAILABLE] Broken pipe
On the server side I'm getting:
Signal (11) received.
0# 0x00005BB7C71852AD in tritonserver
1# 0x00007CBE66776520 in /lib/x86_64-linux-gnu/libc.so.6
2# 0x00007CBE671AADC7 in /opt/tritonserver/bin/../lib/libtritonserver.so
3# TRITONBACKEND_ResponseNewFromFactory in /opt/tritonserver/bin/../lib/libtritonserver.so
4# 0x00007CBE6383D16E in /opt/tritonserver/backends/python/libtriton_python.so
5# 0x00007CBE6383DD40 in /opt/tritonserver/backends/python/libtriton_python.so
6# 0x00007CBE6384FB9D in /opt/tritonserver/backends/python/libtriton_python.so
7# 0x00007CBE667CDEE8 in /lib/x86_64-linux-gnu/libc.so.6
8# 0x00007CBE638324EB in /opt/tritonserver/backends/python/libtriton_python.so
9# 0x00007CBE63867B92 in /opt/tritonserver/backends/python/libtriton_python.so
10# 0x00007CBE63857DDC in /opt/tritonserver/backends/python/libtriton_python.so
11# 0x00007CBE6385834D in /opt/tritonserver/backends/python/libtriton_python.so
12# 0x00007CBE6384C474 in /opt/tritonserver/backends/python/libtriton_python.so
13# 0x00007CBE667C8AC3 in /lib/x86_64-linux-gnu/libc.so.6
14# 0x00007CBE6685AA40 in /lib/x86_64-linux-gnu/libc.so.6
Triton Information
I built Triton myself last week, so it's fairly up to date.
To Reproduce
The Python backend I'm using is pretty trivial; all it does is send an HTTP request to another endpoint with the request parameters.
def execute(self, requests: List["pb_utils.InferenceRequest"]) -> List["pb_utils.InferenceResponse"]:
    for request in requests:
        asyncio.run_coroutine_threadsafe(
            self._completions(request),
            self._loop)
    return None

async def _completions(self, request: "pb_utils.InferenceRequest"):
    response_sender = request.get_response_sender()
    params = json.loads(request.parameters())
    try:
        response = requests.post("http://localhost:80/v1/completions", json=params)
        assert response.status_code == 200
        response_json = json.dumps(response.json())
        response_sender.send(pb_utils.InferenceResponse(output_tensors=[
            pb_utils.Tensor("output", np.asarray(response_json, dtype=np.object_)),
        ]))
    except Exception as e:
        print(e)
    finally:
        response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
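For context, self._loop is an asyncio event loop running on a background thread; it's started in initialize(), roughly like this:

# Simplified from initialize(): the coroutines scheduled in execute() run on
# this event loop, which lives on its own thread.
self._loop = asyncio.get_event_loop()
self._loop_thread = threading.Thread(
    target=self._engine_loop, args=(self._loop,)
)
self._loop_thread.start()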
My client code is something like:
def callback(result, error):
    if error:
        print(f"error: {error}")
        return
    response = result.get_response()
    output = result.as_numpy("output").item().decode("utf-8")
    d = json.loads(output)
    results[d["request_id"]] = d

client.start_stream(callback=callback)

def f():
    for i in range(100):
        p = parameters.copy()
        key = str(random.randint(1, 999999))
        p["request_id"] = key
        client.async_stream_infer(model_name, inputs, outputs=outputs, parameters=p)
        while key not in results:
            pass

threads = []
for i in range(10):
    t = threading.Thread(target=f)
    t.start()
    threads.append(t)
    time.sleep(1/10)

for t in threads:
    t.join()

client.stop_stream()
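The while key not in results spin at the end of f() is just a crude throttle so each thread only has one request in flight at a time. Purely as a hypothetical variant, reusing the names from the snippet above, the same throttle could block on a per-request threading.Event set by the callback instead of busy-waiting:

# Hypothetical alternative to the busy-wait: the callback sets an Event that the
# sending thread blocks on. "done_events" is an illustrative name, not in my test.
done_events = {}

def callback(result, error):
    if error:
        print(f"error: {error}")
        return
    d = json.loads(result.as_numpy("output").item().decode("utf-8"))
    results[d["request_id"]] = d
    done_events[d["request_id"]].set()

# and inside f(), per request:
done_events[key] = threading.Event()
client.async_stream_infer(model_name, inputs, outputs=outputs, parameters=p)
done_events[key].wait(timeout=60)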
Expected behavior
No segfault
This thread looks related, but the proposed root cause doesn't look relevant to my case: https://github.com/triton-inference-server/server/issues/4491
Hi @zchenyu, can you try replicating the issue on a pre-built Triton container? We may not be able to provide support for a custom-built Triton server. You can pull one from our NGC catalog: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags.
Interesting: I can't reproduce it with nvcr.io/nvidia/tritonserver:23.10-py3.
I don't have the exact commit that I built the image with, but I have a rough timestamp. Here's some debug information:
Build command:
python build.py \
    --backend=python \
    --repoagent=checksum \
    --cache=local \
    --endpoint=grpc \
    --enable-gpu \
    --enable-logging \
    --enable-stats \
    --enable-metrics \
    --enable-gpu-metrics \
    --enable-cpu-metrics \
    --enable-tracing \
    --enable-nvtx
$ docker inspect xxx
...
"Created": "2023-11-13T22:11:21.355410303Z",
"DockerVersion": "23.0.3",
"Config": {
...
"Env": [
"PATH=/opt/tritonserver/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin",
"CUDA_VERSION=12.2.2.009",
"CUDA_DRIVER_VERSION=535.104.05",
"CUDA_CACHE_DISABLE=1",
"NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS=",
"_CUDA_COMPAT_PATH=/usr/local/cuda/compat",
"ENV=/etc/shinit_v2",
"BASH_ENV=/etc/bash.bashrc",
"SHELL=/bin/bash",
"NVIDIA_REQUIRE_CUDA=cuda>=9.0",
"NCCL_VERSION=2.19.3",
"CUBLAS_VERSION=12.2.5.6",
"CUFFT_VERSION=11.0.8.103",
"CURAND_VERSION=10.3.3.141",
"CUSPARSE_VERSION=12.1.2.141",
"CUSOLVER_VERSION=11.5.2.141",
"CUTENSOR_VERSION=1.7.0.1",
"NPP_VERSION=12.2.1.4",
"NVJPEG_VERSION=12.2.2.4",
"CUDNN_VERSION=8.9.5.29",
"TRT_VERSION=8.6.1.6+cuda12.0.1.011",
"TRTOSS_VERSION=23.10",
"NSIGHT_SYSTEMS_VERSION=2023.3.1.92",
"NSIGHT_COMPUTE_VERSION=2023.2.2.3",
"DALI_VERSION=1.30.0",
"DALI_BUILD=9783408",
"POLYGRAPHY_VERSION=0.49.0",
"TRANSFORMER_ENGINE_VERSION=0.12",
"LD_LIBRARY_PATH=/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
"NVIDIA_VISIBLE_DEVICES=all",
"NVIDIA_DRIVER_CAPABILITIES=compute,utility,video",
"NVIDIA_PRODUCT_NAME=Triton Server",
"GDRCOPY_VERSION=2.3",
"HPCX_VERSION=2.16rc4",
"MOFED_VERSION=5.4-rdmacore39.0",
"OPENUCX_VERSION=1.15.0",
"OPENMPI_VERSION=4.1.5rc2",
"RDMACORE_VERSION=39.0",
"OPAL_PREFIX=/opt/hpcx/ompi",
"OMPI_MCA_coll_hcoll_enable=0",
"LIBRARY_PATH=/usr/local/cuda/lib64/stubs:",
"NVIDIA_TRITON_SERVER_BASE_VERSION=23.10",
"NVIDIA_BUILD_ID=<unknown>",
"TRITON_SERVER_VERSION=2.41.0dev",
"NVIDIA_TRITON_SERVER_VERSION=23.12dev",
"UCX_MEM_EVENTS=no",
"TF_ADJUST_HUE_FUSED=1",
"TF_ADJUST_SATURATION_FUSED=1",
"TF_ENABLE_WINOGRAD_NONFUSED=1",
"TF_AUTOTUNE_THRESHOLD=2",
"TRITON_SERVER_GPU_ENABLED=1",
"TRITON_SERVER_USER=triton-server",
"DEBIAN_FRONTEND=noninteractive",
"TCMALLOC_RELEASE_RATE=200",
"DCGM_VERSION=3.2.6"
],
"Image": "sha256:93942c41b1e9a43d902d758a5f4faab1c47e816895ab6a6c52cc23346e64beec",
"Labels": {
"com.nvidia.build.id": "<unknown>",
"com.nvidia.build.ref": "",
"com.nvidia.cublas.version": "12.2.5.6",
"com.nvidia.cuda.version": "9.0",
"com.nvidia.cudnn.version": "8.9.5.29",
"com.nvidia.cufft.version": "11.0.8.103",
"com.nvidia.curand.version": "10.3.3.141",
"com.nvidia.cusolver.version": "11.5.2.141",
"com.nvidia.cusparse.version": "12.1.2.141",
"com.nvidia.cutensor.version": "1.7.0.1",
"com.nvidia.nccl.version": "2.19.3",
"com.nvidia.npp.version": "12.2.1.4",
"com.nvidia.nsightcompute.version": "2023.2.2.3",
"com.nvidia.nsightsystems.version": "2023.3.1.92",
"com.nvidia.nvjpeg.version": "12.2.2.4",
"com.nvidia.tensorrt.version": "8.6.1.6+cuda12.0.1.011",
"com.nvidia.tensorrtoss.version": "23.10",
"com.nvidia.tritonserver.version": "2.41.0dev",
"com.nvidia.volumes.needed": "nvidia_driver",
"org.opencontainers.image.ref.name": "ubuntu",
"org.opencontainers.image.version": "22.04"
}
},
"Architecture": "amd64",
"Os": "linux",
"Size": 9401638502,
"VirtualSize": 9401638502,
"GraphDriver": {
"Data": {
"LowerDir": "/var/lib/docker/overlay2/d75813542e3e7b15f52c3c0c6fb641d9f2bc337a2cacf2038e4213eb4cd0886e/diff:/var/lib/docker/overlay2/c1282e41843a03fc801713e21e9064633658ea5a884fba3ba337f039909255f0/diff:/var/lib/docker/overlay2/5f1a4cd28dfc0be0d82d7469a42b5a7339d634685d8c0779106f42aeff4348fe/diff:/var/lib/docker/overlay2/450e3dc583323e273d2430e8733b15e757a187b5006b44e156e88b2dd92ef72a/diff:/var/lib/docker/overlay2/dbbff2a12bf5ef2f8c0130fd0333512760231313a1a4281566e8e4eefbb87a45/diff:/var/lib/docker/overlay2/c5472de0761a7efdb700e86a64a6cc93054124e09c27380102a6d119e170b67c/diff:/var/lib/docker/overlay2/983abd7136948d8ab6d59997c73a05526d9b3d3e7402e3670b82bb4a32af1cf8/diff:/var/lib/docker/overlay2/210d322c12846d5ec02fb9ba16396fbeb1c383be0e058eeea33f499ce50c7c26/diff:/var/lib/docker/overlay2/be06ef52bdaa2cf8790fb5d0f1ed343d468fd7dcf181fa8739c09667fd569a6c/diff:/var/lib/docker/overlay2/ec9fa773be1a2e420114e6d2691ef9f7c0bc857a5cc54f2134c7cafd5704f936/diff:/var/lib/docker/overlay2/1fa153d3254394bc272cd2a06965eedc7e5142864044e23bc057370b82a3078a/diff:/var/lib/docker/overlay2/29de04475bb4f4da2e3978b1765eb7f86b11e20d3878ff9b7939bad8969195a6/diff:/var/lib/docker/overlay2/21556fc658849fd12f08f93f25e6698adb5cd26f31dfcdcd44bde10e32957765/diff:/var/lib/docker/overlay2/abe3a3aa52ccad7b7a15df930732bd203b543d19ac622cc6152e7dd7f4aa66da/diff:/var/lib/docker/overlay2/6fc7c812b1db034372e837b6f14ed5a81fa56fee316e7d5994cc92588543b5f5/diff:/var/lib/docker/overlay2/7cc2edfaa36054a101d5cf8c1d9698ab5194f0c6c9479b20622c00a88aed50ad/diff:/var/lib/docker/overlay2/9769f387c96aa5f11aaae6380dbc23abbf198000fe58939874c4163dbc00d700/diff:/var/lib/docker/overlay2/fc8ba61ddd51ecc3bbfee57f6f5075f48ae2939bf97ba98375d5231c3417d644/diff:/var/lib/docker/overlay2/59de1b5b1244e06a8847bd0afc8cba1d2c2131903cd1bf5b8e196ac4a55c0e18/diff:/var/lib/docker/overlay2/42574c2d7fa9eb78f023f49aea2bc40c2e5f486e6293159ab5a712eec9f8be6d/diff:/var/lib/docker/overlay2/5368bd3a8f358af8f2ae913c3a47fb377cbc14d560388a1b7bbf9bd5c5049374/diff:/var/lib/docker/overlay2/78c9533874522fecf81a6d5a245ca21e07b7415df9311d45a3f1615fcc090fb2/diff:/var/lib/docker/overlay2/bafa55c90a3758f7c8c4dcb99b3495bb4bdabc1582822c17c576107a36b346f9/diff:/var/lib/docker/overlay2/f56ddd480b5d89eaca181b8679b2d7a72fb85b2f019a5cc71208f5215e64dafc/diff",
"MergedDir": "/var/lib/docker/overlay2/13b180020399dbfc5d6c1c51db27f3eb83b32c9863ba0b10511523728eec35e7/merged",
"UpperDir": "/var/lib/docker/overlay2/13b180020399dbfc5d6c1c51db27f3eb83b32c9863ba0b10511523728eec35e7/diff",
"WorkDir": "/var/lib/docker/overlay2/13b180020399dbfc5d6c1c51db27f3eb83b32c9863ba0b10511523728eec35e7/work"
},
"Name": "overlay2"
},
"RootFS": {
"Type": "layers",
"Layers": [
"sha256:01d4e4b4f381ac5a9964a14a650d7c074a2aa6e0789985d843f8eb3070b58f7d",
"sha256:28dedf274dd82e720c1e5c651c5fdbcafff8f9687d4a3d95af147e0e3caa8bb2",
"sha256:d12750ac86fb921cdc38acb4d5154a52de2e4e70f42330837f01bc421194ba6c",
"sha256:f12ed123586ee818842e5bd0db946e04f39d8d679c18b275df913498feb4281c",
"sha256:13b41d13cbee5b050a1d526c2ee6b2643ad86b4998e69a5348dc43819f3fb8e0",
"sha256:dddc962cd9c168b2b79181f7c266e35ff0cc87701ab6d9fb676f9bf6561513b0",
"sha256:5f8dc6c4d9ac310ba9c68b79b12a46409e112490eeae38019187ac4a83ea6199",
"sha256:f70bf143e96789fb387d63da983c432d2717de7f2df20c90cfe0c66aeedb3041",
"sha256:f7f51f3bc96eea52cf5019d3cce2900b02797f48e7ce3ac947b74e9662ea9dd6",
"sha256:0161fdcfae36218edd6eed2c11675216b65bb5d2d0a0aef3b8c51a626ecb60c1",
"sha256:54788d2d82c90da0d14e3aa9852003bc801d68836f19b9f64d4386b8c14b2349",
"sha256:6bcd3616240bb782802144d3b62423cf7f40dedf28772113620fa294887b4ec0",
"sha256:09e624be0b72bb2a021c9d78329d9715881af55036a1cef08dd2f0a2ba5c3410",
"sha256:1db7c5886680d09dfb46e0364b27f30e29e853b64c412ed4133faf090dee8a92",
"sha256:17d01486a53eed207a7583c37e093e44ad7867c9fc3d5f39c2ba73ddf2fda447",
"sha256:338bf418d8108fc4767eab3d499b826f08dbcd655d8f211f144d3dc83c576118",
"sha256:4e7b1830b19e20994594e13200b8f34ef934bb765156700c8f3c8cc5aceebfaa",
"sha256:2f692708fcd456117d4add4bef906864317f3bbcdd1cfb787303bf1587796218",
"sha256:8e17adb029ca5f860b1f67d4b665b0e22b5decb92f6b8cf7a36d0dd95854359b",
"sha256:be9535508c17c3a28c660f3ce02dfa3cc52ce3af7a3823a26bccf725ccb7c204",
"sha256:83eab2f33db5c2072b53e4781bb4fb6acab8fa1523532fbd5a00087389fe701a",
"sha256:f9f338fcea43073298e5108ebafc74c2239ace0da58447f79529f5bde8a59616",
"sha256:f23b0b1e8134ca1451ae2838e017a6716f4d41fa4ecaa4d2e25266392f373a70",
"sha256:606e4be5087966c9f32c68a31429d1bcf350c4908c1cb51d84b2c0606d58f936",
"sha256:d009aede8f5fdb65caf00162bafa3510499f85c54b0ecbdeb230276ae937d75c"
]
},
Okay, I spoke a little too soon. With the 23.10-py3 base image, I'm still getting the segfault, but only at the end of my load test.
Maybe there's some small bug in how I'm implementing my Python backend?
Here's my full model.py, mostly modeled on https://github.com/triton-inference-server/vllm_backend/blob/main/src/model.py :
import asyncio
import json
import numpy as np
import requests
import threading
import time
import triton_python_backend_utils as pb_utils
from typing import Dict, List
import aiohttp


class TritonPythonModel:
    @staticmethod
    def auto_complete_config(auto_complete_model_config: pb_utils.ModelConfig) -> pb_utils.ModelConfig:
        inputs = []  # We use parameters instead of tensors for inputs for ease of use.
        outputs = [{
            "name": "text_output",
            "data_type": "TYPE_STRING",
            "dims": [1],
        }]
        for input in inputs:
            auto_complete_model_config.add_input(input)
        for output in outputs:
            auto_complete_model_config.add_output(output)
        # We handle batching on our own.
        auto_complete_model_config.set_max_batch_size(0)
        # Generated results may be returned out of order.
        #auto_complete_model_config.set_model_transaction_policy({"decoupled": True})
        return auto_complete_model_config

    def initialize(self, args: Dict[str, str]):
        self._logger = pb_utils.Logger
        self.model_config = model_config = json.loads(args["model_config"])
        using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config
        )
        assert using_decoupled
        while True:
            print("Probing...")
            try:
                time.sleep(1)
                response = requests.post("http://localhost:80/v1/completions", json={
                    "prompt": "The sky is",
                })
                if response.status_code == 200:
                    break
                print(response)
            except Exception as e:
                print(e)
        self._loop = asyncio.get_event_loop()
        self._loop_thread = threading.Thread(
            target=self._engine_loop, args=(self._loop,)
        )
        self._shutdown_event = asyncio.Event()
        self._loop_thread.start()

    def execute(self, requests: List["pb_utils.InferenceRequest"]) -> List["pb_utils.InferenceResponse"]:
        for request in requests:
            asyncio.run_coroutine_threadsafe(
                self._completions(request),
                self._loop)
        return None

    def finalize(self):
        self._shutdown_event.set()
        if self._loop_thread is not None:
            self._loop_thread.join()
            self._loop_thread = None

    def _engine_loop(self, loop):
        asyncio.set_event_loop(loop)
        self._loop.run_until_complete(self._await_shutdown())

    async def _await_shutdown(self):
        while self._shutdown_event.is_set() is False:
            await asyncio.sleep(5)
        for task in asyncio.all_tasks(loop=self._loop):
            if task is not asyncio.current_task():
                task.cancel()

    async def _completions(self, request: "pb_utils.InferenceRequest"):
        response_sender = request.get_response_sender()
        params = json.loads(request.parameters())
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post("http://localhost:80/v1/completions", json=params) as response:
                    assert response.status == 200
                    response_json = await response.json()
                    response_str = json.dumps(response_json)
                    response_sender.send(pb_utils.InferenceResponse(output_tensors=[
                        pb_utils.Tensor("text_output", np.asarray(response_str, dtype=np.object_)),
                    ]))
        except Exception as e:
            print(e)
        finally:
            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
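One detail I wasn't sure about is that the response and the FINAL flag go out in two separate send() calls. Below is a minimal sketch of the single-call variant; it assumes response_sender.send() accepts a response together with flags (as the decoupled Python-backend examples seem to), and I haven't verified whether it changes anything here.

# Sketch only: send the response and the completion flag in one call instead of two.
# Assumes response_sender.send(response, flags=...) is accepted; not verified to
# affect the segfault.
async def _completions(self, request: "pb_utils.InferenceRequest"):
    response_sender = request.get_response_sender()
    params = json.loads(request.parameters())
    try:
        async with aiohttp.ClientSession() as session:
            async with session.post("http://localhost:80/v1/completions", json=params) as response:
                assert response.status == 200
                response_str = json.dumps(await response.json())
        response_sender.send(
            pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor("text_output", np.asarray(response_str, dtype=np.object_)),
            ]),
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
        )
    except Exception as e:
        # Still close the stream on failure so the client isn't left hanging.
        print(e)
        response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)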
Thanks for the additional information on reproduction. I have filed a ticket for us to investigate further.
I'm having the same issue when sending multiple async requests to descendant models.