Memory over 100% with decoupled dali video model
Description
Triton uses over 100% of physical memory and freezes the server when using a decoupled DALI model with a long video input.
Triton Information
Docker nvcr.io/nvidia/tritonserver:24.03-py3
To Reproduce
resize_384/1/dali.py
import nvidia.dali as dali
from nvidia.dali.plugin.triton import autoserialize  # must include

@dali.plugin.triton.autoserialize
@dali.pipeline_def(batch_size=1, num_threads=1, output_dtype=dali.types.FLOAT, output_ndim=3)
def pipeline():
    vid = dali.fn.experimental.inputs.video(name="INPUT", sequence_length=1, device='mixed')
    height = 384
    width = 640
    vid = dali.fn.resize(vid, resize_x=width, resize_y=height, mode="default",
                         interp_type=dali.types.DALIInterpType.INTERP_CUBIC)  # resize
    vid = dali.fn.crop(vid, crop_w=width, crop_h=height, out_of_bounds_policy="pad")  # pad
    vid = dali.fn.squeeze(vid, axes=0)  # remove sequence dim
    # vid = dali.fn.color_space_conversion(vid, image_type=dali.types.BGR, output_type=dali.types.RGB)  # BGR to RGB
    vid = dali.fn.cast(vid, dtype=dali.types.FLOAT)  # UINT8 to FP32
    vid = vid / dali.types.Constant(255)
    vid = dali.fn.transpose(vid, perm=[2, 0, 1], name="OUTPUT")  # HWC to CHW
    return vid
resize_384/config.pbtxt
backend: "dali"
max_batch_size: 0
model_transaction_policy {
decoupled: True
}
instance_group [
{
count: 1
kind: KIND_GPU
gpus: [ 1 ]
}
]
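For reference, the model repository passed to the server below is assumed to be laid out as follows (inferred from the file paths above):
/home/ubuntu/model_repository/
└── resize_384/
    ├── config.pbtxt
    └── 1/
        └── dali.py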
client.py
#!/usr/bin/env python
import argparse
import numpy as np
import tritonclient.grpc
from functools import partial
import queue
import cv2
import time
import gc
import torch


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8001',
                        help='Inference server URL. Default is localhost:8001.')
    parser.add_argument('-v', '--video', type=str, required=False, default='/home/ubuntu/test_videos/office.mp4',
                        help='Path to video file')
    return parser.parse_args()


requests = queue.Queue()


def callback(queue, result, error):
    if error:
        print(error)
        queue.put(error)
    else:
        queue.put(result)


def main():
    FLAGS = parse_args()
    cam = cv2.VideoCapture(FLAGS.video)
    frame_count = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cam.get(cv2.CAP_PROP_FPS)
    del cam
    gc.collect()
    with tritonclient.grpc.InferenceServerClient(url=FLAGS.url) as triton_client:
        # Start stream
        triton_client.start_stream(callback=partial(callback, requests))
        # Config output
        outputs = []
        outputs.append(tritonclient.grpc.InferRequestedOutput("OUTPUT"))
        # Config input 0: video
        video_raw = np.expand_dims(np.fromfile(FLAGS.video, dtype=np.uint8), axis=0)
        inputs = [
            tritonclient.grpc.InferInput("INPUT", video_raw.shape, "UINT8"),
        ]
        inputs[0].set_data_from_numpy(video_raw)
        print(f"{time.perf_counter() - tic:0.4f} |", FLAGS.video, frame_count, video_raw.shape)
        triton_client.async_stream_infer(model_name="resize_384", inputs=inputs, outputs=outputs)
        # Collect results
        frame_num = 0
        while True:
            data_item = requests.get()
            bbox = torch.from_numpy(data_item.as_numpy("OUTPUT").copy())
            if frame_num % 1000 == 0:
                print(f"{time.perf_counter() - tic:0.4f} | {frame_num}/{frame_count}")
            frame_num += 1
            if frame_num >= frame_count:
                break
        triton_client.close()


if __name__ == '__main__':
    tic = time.perf_counter()
    main()
    print(f"{time.perf_counter() - tic:0.4f} | Done")
Run the Triton server with
docker run -it --gpus=all --net=host --rm -v /home/ubuntu:/home/ubuntu nvcr.io/nvidia/tritonserver:24.03-py3 tritonserver --model-repository=/home/ubuntu/model_repository
then run client.py with a video of ~18 minutes.
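While client.py runs, the server's memory growth can be watched with a small helper like the one below. This is a sketch, not part of the original reproduction; it assumes psutil is installed on the host and that the server processes are named tritonserver.
#!/usr/bin/env python
# Hypothetical monitoring helper (not part of the reproduction): prints the
# combined resident set size of all tritonserver processes once per second.
import time
import psutil

def tritonserver_rss_gib():
    total = 0
    for proc in psutil.process_iter(['name', 'memory_info']):
        try:
            if proc.info['name'] and 'tritonserver' in proc.info['name']:
                total += proc.info['memory_info'].rss
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return total / 1024 ** 3

if __name__ == '__main__':
    while True:
        print(f"{time.strftime('%H:%M:%S')} | tritonserver RSS: {tritonserver_rss_gib():.1f} GiB")
        time.sleep(1)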
Triton server memory usage grows to 75 GB of RAM. With a longer video, RAM usage goes over 100% and the server freezes.
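For scale, the observed figure is roughly what the preprocessed frames would occupy if they all stayed in memory. This is an illustrative back-of-the-envelope calculation only; the frame rate is an assumption and is not stated above.
# Illustrative arithmetic (assumed ~25 fps; not stated in the report):
frame_bytes = 3 * 384 * 640 * 4        # one CHW float32 response ≈ 2.8 MiB
frames = 18 * 60 * 25                  # ~18 min of video ≈ 27,000 frames
print(frame_bytes * frames / 2 ** 30)  # ≈ 74 GiB if every response stays resident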
Expected behavior
RAM usage should not go over 100% and should not depend on the length of the input video.