
Error Code 1: Cuda Driver (context is destroyed) CUDA_ERROR_CONTEXT_IS_DESTROYED

stq054188 opened this issue 5 months ago · 4 comments

Description

```
[09/12/2024-14:32:23] [TRT] [E] 1: [defaultAllocator.cpp::nvinfer1::internal::DefaultAllocator::deallocate::42] Error Code 1: Cuda Runtime (invalid argument)
(the line above repeats six times)
[09/12/2024-14:32:23] [TRT] [E] 1: [cudaDriverHelpers.cpp::nvinfer1::CuDeleter<struct CUmod_st *,&enum cudaError_enum __cdecl nvinfer1::cuModuleUnloadWrapper(struct CUmod_st *)>::operator ()::30] Error Code 1: Cuda Driver (context is destroyed)
(the line above repeats twice)
CUDA_ERROR_CONTEXT_IS_DESTROYED
```
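I suspect these errors come from TensorRT objects (engine, execution context, device buffers) being garbage-collected only after the PyCUDA context they were created in has already been destroyed at interpreter exit. Below is a minimal sketch of the explicit release order I believe should avoid that; `destroy()` is a hypothetical method I would add, not something in the script further down (which currently relies on `__del__`):

```python
import pycuda.driver as cuda

class TrtInference:
    # engine/context/buffer setup as in the full script below

    def destroy(self):
        """Release TRT objects while their CUDA context is still alive."""
        self.cuda_ctx.push()        # make this object's context current
        del self.context            # free the execution context before the engine
        del self.engine
        del self.cuda_inputs        # device buffers allocated in this context
        del self.cuda_outputs
        self.cuda_ctx.pop()
        self.cuda_ctx.detach()      # finally release the context itself
```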

Environment

TensorRT Version: 8.5.3.1

NVIDIA GPU: RTX3080

NVIDIA Driver Version:

CUDA Version: 11.6

CUDNN Version: 8.9.5.30

Operating System: Win11

Python Version (if applicable): 3.8

Tensorflow Version (if applicable):

PyTorch Version (if applicable): 1.13.1

Baremetal or Container (if so, version):

Relevant Files

Model link: https://drive.google.com/drive/folders/16JJ274kvQfyUFzRmLV08hqkyib7OPE0U?usp=drive_link

Steps To Reproduce

Commands or scripts:

```python
import warnings
warnings.filterwarnings("ignore")

import ctypes
import os
import numpy as np
import cv2
import random
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

import torch
import math
from torchvision.ops import roi_align
import argparse
import platform
import shutil
import time
from pathlib import Path
import sys
import json

from ultralytics import YOLO


class TrtInference():
    _batch_size = 1

    def __init__(self, model_path=None):
        self._model_path = model_path
        if self._model_path is None:
            print("please set trt model path!")
            exit()
        # Note: pycuda.autoinit above already created a context; this makes a second one.
        self.cuda_ctx = cuda.Device(0).make_context()
        #self.stream = cuda.Stream()

        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        trt.init_libnvinfer_plugins(TRT_LOGGER, '')
        runtime = trt.Runtime(TRT_LOGGER)

        # deserialize engine
        with open(self._model_path, 'rb') as f:
            buf = f.read()
            self.engine = runtime.deserialize_cuda_engine(buf)

        self.context = self.engine.create_execution_context()
        #---------------------------#

        # Fix the batch dimension of every input binding.
        for index, binding in enumerate(self.engine):
            if self.engine.binding_is_input(binding):
                batch_shape = list(self.engine.get_binding_shape(binding)).copy()
                batch_shape[0] = self._batch_size
                self.context.set_binding_shape(index, batch_shape)
        self.host_inputs, self.host_outputs, self.cuda_inputs, self.cuda_outputs, self.bindings = self._allocate_buffers()

    def _allocate_buffers(self):
        host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings = \
            [], [], [], [], []
        for index, binding in enumerate(self.engine):
            size = trt.volume(self.context.get_binding_shape(index)) * \
                   self.engine.max_batch_size
            host_mem = cuda.pagelocked_empty(size, np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
        return host_inputs, host_outputs, cuda_inputs, cuda_outputs, bindings

    #def destroy(self):
    def __del__(self):
        """Free CUDA memories and context."""
        print('---del func---')
        # del self.host_inputs
        # del self.host_outputs
        # del self.cuda_inputs
        # del self.cuda_outputs
        # del self.bindings

        # del self.engine
        # #del self.stream
        # self.cuda_ctx.pop()
        # del self.cuda_ctx
        #print(self.cuda_ctx.detach())
        #pycuda.tools.clear_context_caches()
        #cuda.DeviceAllocation.free()

    def inference(self, inputs):
        self.cuda_ctx.push()
        stream = cuda.Stream()

        try:
            np.copyto(self.host_inputs[0], inputs[0].ravel())
            np.copyto(self.host_inputs[1], inputs[1].ravel())
            np.copyto(self.host_inputs[2], inputs[2].ravel())
        except:
            self.cuda_ctx.pop()  # don't leave the context pushed on failure
            return None

        cuda.memcpy_htod_async(
            self.cuda_inputs[0], self.host_inputs[0], stream)
        cuda.memcpy_htod_async(
            self.cuda_inputs[1], self.host_inputs[1], stream)
        cuda.memcpy_htod_async(
            self.cuda_inputs[2], self.host_inputs[2], stream)
        self.context.execute_async(
            batch_size=1,
            bindings=self.bindings,
            stream_handle=stream.handle)

        cuda.memcpy_dtoh_async(
            self.host_outputs[0], self.cuda_outputs[0], stream)

        stream.synchronize()
        self.cuda_ctx.pop()

        output = np.reshape(self.host_outputs[0], (32, 15))

        return output


class ActionPredict:
    def __init__(self, slowfast_model_path='', yolo_model_path='', slowfast_label_path=''):
        self.slowfast_model_path = slowfast_model_path
        self.yolo_model_path = yolo_model_path
        self.slowfast_label_path = slowfast_label_path

        self.action = ''
        self.name_list = []
        self.box_list = []

        self.action_names = self.get_class_names(self.slowfast_label_path)
        self.yolo_model = YOLO(self.yolo_model_path, task='detect')
        self.slowfast = TrtInference(self.slowfast_model_path)

    def runActionPred(self, frames):
        name_list, rect_list = self.run_object_detection(frames)
        action = self.run_action_detection(frames, self.action_names)

        self.action = action
        self.box_list = rect_list
        self.name_list = name_list

    def get_action_preds(self):
        return self.action

    def get_name_list(self):
        return self.name_list

    def get_box_list(self):
        return self.box_list

    def detect_objects_single(self, yolo_model, frame):
        tensor_bboxs = []
        result_list = []

        #------ only run detection on the middle frame -------#
        results = yolo_model(frame, conf=0.5, task='detect', verbose=False)

        #----------- used for action recognition -----------#
        boxes = results[0].boxes
        mask = boxes.cls == 2  # label index corresponding to 'person'
        pred_boxes = boxes.xyxy[mask]

        if len(pred_boxes.tolist()) > 0:
            tensor_bboxs.append(pred_boxes)  # used for action recognition
        #----------- used for action recognition -----------#
        find_person = False
        for result in results:
            if find_person:
                break
            for box in result.boxes:
                text_name = f"{result.names[int(box.cls[0])]}"
                #print(text_name)
                if text_name != 'person':
                    continue
                find_person = True
                #rect = (float(box.xyxy[0][0]), float(box.xyxy[0][1]), float(box.xyxy[0][2]), float(box.xyxy[0][3]))

                result_list.append([float(box.xyxy[0][0]), float(box.xyxy[0][1]), float(box.xyxy[0][2]), float(box.xyxy[0][3])])
                break

        if len(result_list) == 0:
            for i in range(32 - len(result_list)):
                result_list.append([float(0), float(0), float(0), float(0)])
        else:
            for i in range(1, 32):
                result_list.append(result_list[0])

        return torch.from_numpy(np.array(result_list))

    def run_object_detection(self, frames):
        tensor_bboxs = []
        name_list = []
        box_list = []

        for frame in frames:
            results = self.yolo_model(frame, conf=0.5, task='detect', verbose=False)
            label_list = []
            rect_list = []

            #----------- used for action recognition -----------#
            boxes = results[0].boxes
            mask = boxes.cls == 2  # label index corresponding to 'person'
            pred_boxes = boxes.xyxy[mask]

            if len(pred_boxes.tolist()) > 0:
                tensor_bboxs.append(pred_boxes)  # used for action recognition
            #----------- used for action recognition -----------#
            for result in results:
                for box in result.boxes:
                    text_name = f"{result.names[int(box.cls[0])]}"
                    rect = (int(box.xyxy[0][0]), int(box.xyxy[0][1]), int(box.xyxy[0][2]), int(box.xyxy[0][3]))
                    label_list.append(text_name)
                    rect_list.append(rect)
            name_list.append(label_list)
            box_list.append(rect_list)
        return name_list, box_list

    def run_action_detection(self, frames, classes):
        action = ''
        bboxes = self.detect_objects_single(self.yolo_model, frames[15])

        if bboxes is not None:
            frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for frame in frames]
            frames = [self.scale(256, frame) for frame in frames]
            inputs = self.process_cv2_inputs(frames)

            if bboxes is not None:
                bboxes = self.scale_boxes(256, bboxes, 1080, 1920)
                index_pad = torch.full(
                    size=(bboxes.shape[0], 1),
                    fill_value=float(0),
                    device=bboxes.device,
                )
                # Pad frame index for each box.
                bboxes = torch.cat([index_pad, bboxes], axis=1)
            #print('------------1-----------')
            for i in range(len(inputs)):
                inputs[i] = inputs[i].numpy()
                #print(inputs[i])
            #print('------------2-----------')
            if bboxes is not None:
                inputs.append(bboxes.numpy().astype(np.float32))
                outputs = self.slowfast.inference(inputs)

                if outputs is None:
                    return ''
                list_out = list(outputs[0])
                index_of_max = list_out.index(max(list_out))
                name = classes[index_of_max]
                score = list_out[index_of_max]
                #print(name + ': ' + str(score))
                str_score = '%0.2f' % score
                action = name + ': ' + str_score
                #print('-----------outputs[0]------------')
                #print(outputs)
        else:
            action = ''
        return action

    def scale(self, size, image):
        """
        Scale the short side of the image to size.
        Args:
            size (int): size to scale the image.
            image (array): image to perform short side scale. Dimension is
                `height` x `width` x `channel`.
        Returns:
            (ndarray): the scaled image with dimension of
                `height` x `width` x `channel`.
        """
        height = image.shape[0]
        width = image.shape[1]
        # print(height, width)
        if (width <= height and width == size) or (
            height <= width and height == size
        ):
            return image
        new_width = size
        new_height = size
        if width < height:
            new_height = int(math.floor((float(height) / width) * size))
        else:
            new_width = int(math.floor((float(width) / height) * size))
        img = cv2.resize(
            image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
        )
        # print(new_width, new_height)
        return img.astype(np.float32)

    def tensor_normalize(self, tensor, mean, std, func=None):
        """
        Normalize a given tensor by subtracting the mean and dividing the std.
        Args:
            tensor (tensor): tensor to normalize.
            mean (tensor or list): mean value to subtract.
            std (tensor or list): std to divide.
        """
        if tensor.dtype == torch.uint8:
            tensor = tensor.float()
            tensor = tensor / 255.0
        if type(mean) == list:
            mean = torch.tensor(mean)
        if type(std) == list:
            std = torch.tensor(std)
        if func is not None:
            tensor = func(tensor)
        tensor = tensor - mean
        tensor = tensor / std
        return tensor

    def scale_boxes(self, size, boxes, height, width):
        """
        Scale the short side of the box to size.
        Args:
            size (int): size to scale the image.
            boxes (ndarray): bounding boxes to perform scale. The dimension is
                `num boxes` x 4.
            height (int): the height of the image.
            width (int): the width of the image.
        Returns:
            boxes (ndarray): scaled bounding boxes.
        """
        if (width <= height and width == size) or (
            height <= width and height == size
        ):
            return boxes

        new_width = size
        new_height = size
        if width < height:
            new_height = int(math.floor((float(height) / width) * size))
            boxes *= float(new_height) / height
        else:
            new_width = int(math.floor((float(width) / height) * size))
            boxes *= float(new_width) / width
        return boxes

    def process_cv2_inputs(self, frames):
        """
        Normalize and prepare inputs as a list of tensors. Each tensor
        corresponds to a unique pathway.
        Args:
            frames (list of array): list of input images (corresponding to one clip) in range [0, 255].
        """
        inputs = torch.from_numpy(np.array(frames)).float() / 255
        inputs = self.tensor_normalize(inputs, [0.45, 0.45, 0.45], [0.225, 0.225, 0.225])
        # T H W C -> C T H W.
        inputs = inputs.permute(3, 0, 1, 2)
        # Sample frames for num_frames specified.
        index = torch.linspace(0, inputs.shape[1] - 1, 32).long()
        #print(index)
        inputs = torch.index_select(inputs, 1, index)
        fast_pathway = inputs
        slow_pathway = torch.index_select(
            inputs,
            1,
            torch.linspace(
                0, inputs.shape[1] - 1, inputs.shape[1] // 4
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        #print(np.shape(frame_list[0]))
        inputs = [inp.unsqueeze(0) for inp in frame_list]
        return inputs

    def get_class_names(self, label_path):
        name_list = []
        with open(label_path, 'r') as file:
            line = file.readline()
            while line:
                #print(line.strip())  # strip the trailing newline
                name = line.strip()
                name_list.append(name)
                line = file.readline()
        return name_list


def create_video_writer(video_cap, output_filename):
    # grab the width, height, and fps of the frames in the video stream
    frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(video_cap.get(cv2.CAP_PROP_FPS))
    # initialize the FourCC and a video writer object
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')
    writer = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))
    return writer


if __name__ == "__main__":
    slowfast_model_path = 'action.engine'
    slowfast_label_path = 'action.txt'
    yolo_model_path = 'object.engine'

    cap = cv2.VideoCapture('./test.mp4')
    #cap = cv2.VideoCapture('rtmp://10.17.170.64/live/')
    print(cap.isOpened())
    frame_list = []
    #output_filename = "out.mp4"
    #writer = create_video_writer(cap, output_filename)
    predictor = ActionPredict(slowfast_model_path, yolo_model_path, slowfast_label_path)
    count = 0
    while True:
        _, frame = cap.read()
        if frame is None:
            break
        frame_list.append(frame)
        if len(frame_list) == 32:
            start = cv2.getTickCount()
            predictor.runActionPred(frame_list)
            action = predictor.get_action_preds()
            name_list = predictor.get_name_list()
            box_list = predictor.get_box_list()

            # record the end time
            end = cv2.getTickCount()
            # elapsed time
            use_time = (end - start) / cv2.getTickFrequency()
            print('use-time: %.4fs' % use_time)

            print(action)
            # LOGGER.info(name_list)
            # LOGGER.info(box_list)

            for i in range(len(frame_list)):
                img = frame_list[i]
                for j in range(0, len(name_list[i])):
                    name = name_list[i][j]

                    x1 = box_list[i][j][0]
                    y1 = box_list[i][j][1]
                    x2 = box_list[i][j][2]
                    y2 = box_list[i][j][3]

                    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 2, cv2.LINE_AA)
                    cv2.putText(img, name, (x1, y1 + 30), 0, 1.5, (255, 255, 0), 2)

                    if name == 'person':
                        cv2.putText(img, action, (x1, y1 - 30), 0, 2.2, (0, 255, 0), 3)
                #writer.write(img)
                showimg = cv2.resize(img, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_LINEAR)
                cv2.imshow('img', showimg)
                cv2.waitKey(1)

            frame_list.clear()
            count += 1
            if count == 10:
                break
        else:
            continue
    cap.release()
    #writer.release()
    #predictor.slowfast.destroy()
    print('------OK------')
```
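One thing I am considering, offered as a guess rather than a confirmed fix: the script both imports `pycuda.autoinit` (which creates and activates a context) and calls `cuda.Device(0).make_context()` inside `TrtInference`, so two contexts exist and TensorRT objects may be torn down against the wrong one at shutdown. A sketch of the inference path using only the autoinit context (class and method names here are illustrative, not from the script above):

```python
import pycuda.autoinit          # creates and activates one global CUDA context
import pycuda.driver as cuda
import tensorrt as trt

class TrtInferenceSingleCtx:
    """Variant that reuses the pycuda.autoinit context (no make_context)."""

    def __init__(self, model_path):
        logger = trt.Logger(trt.Logger.INFO)
        trt.init_libnvinfer_plugins(logger, '')
        with open(model_path, 'rb') as f:
            self.engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

    def infer(self, host_input, host_output, d_input, d_output, bindings):
        # With a single context there is no push()/pop() bookkeeping, so
        # nothing can be deallocated after its context is destroyed.
        stream = cuda.Stream()
        cuda.memcpy_htod_async(d_input, host_input, stream)
        self.context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_output, d_output, stream)
        stream.synchronize()
        return host_output
```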

Have you tried the latest release?: No

Can this model run on other frameworks? For example run ONNX model with ONNXRuntime (`polygraphy run <model.onnx> --onnxrt`): Yes

stq054188 · Sep 12 '24 06:09