executeV2: Error Code 1: Cask (Cask Pooling Runner Execute Failure)
when I use the lateset expressing for loading the resnet-50, it can run successfully
# 加载预训练的 ResNet-50 模型
model= models.resnet50(ResNet50_Weights.IMAGENET1K_V1)
.......
context.execute_v2(bindings)
cuda.memcpy_dtoh(output_data,d_output)
output:
[[-1.99739981e+00 -8.73850882e-01 -2.61901110e-01 -5.06821036e-01
-8.42439711e-01 -1.29197025e+00 -1.12298012e+00 -7.87284374e-01
-1.09715271e+00 -8.88255715e-01 6.76214874e-01 1.26331055e+00
9.71547604e-01 1.26690733e+00 2.13557646e-01 7.49732792e-01
9.17353332e-01 -1.35699183e-01 1.02075779e+00 1.39578366e+00
-3.95577759e-01 1.82343590e+00 1.22511053e+00 1.37567306e+00
6.19295895e-01 -4.07072157e-01 -3.15917075e-01 4.38034356e-01
2.15873435e-01 -1.61093962e+00 -2.36747146e+00 1.51434287e-01
-2.26472473e+00 -1.59191358e+00 1.72776997e-01 -1.72481596e+00
-3.97043675e-01 -1.37065017e+00 1.18780577e+00 -6.37652636e-01
3.37333083e-01 -8.84323299e-01 7.01664329e-01 -1.20757654e-01
6.49434865e-01 6.22194529e-01 6.06806695e-01 -1.30204308e+00
-7.03575671e-01 -1.22511554e+00 1.10196507e+00 -1.30670393e+00
4.24225092e-01 -2.37956554......
but when I got it for pretrained
model=models.resnet50(pretrained=True)
........
context.execute_v2(bindings=[int(devide_in), int(devide_out)])
it return false
Can you export an onnx, then use trtexec to build ?
+1 if you can share an ONNX model / reproduction steps.
notebookbdffc79b63.zip this is my notebook on kaggle, engine is too large so I can't submit it . if run the notebook, you can get same result
pip install tensorrt
print(trt.version)
import tensorrt
#load model
import torch
import torchvision.models as models
model= models.resnet50(pretrained=True)
import tensorrt as trt
import torch
import torchvision.models as models
from torchvision.models import ResNet50_Weights
model.eval()
x = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, x, "resnet50.onnx", opset_version=11,training=torch.onnx.TrainingMode.EVAL,
do_constant_folding=True,input_names=["input"],
output_names=["output"],
verbose=False)
#验证导出的ONNX模型格式是否正确 import onnx onnx_model = onnx.load("/kaggle/working/resnet50.onnx") # load onnx model onnx.checker.check_model(onnx_model) # check onnx model logger=trt.Logger(trt.Logger.WARNING) builder=trt.Builder(logger) network=builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) parser=trt.OnnxParser(network,logger) success=parser.parse_from_file("/kaggle/working/resnet50.onnx")
config=builder.create_builder_config() config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 5<<30) config.get_memory_pool_limit serialized_engine=builder.build_serialized_network(network,config) logger=trt.Logger(trt.Logger.WARNING) runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
def save_engine(engine, path): path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) with open(path, 'wb') as f: f.write(engine)
def load_engine(path): with open(path, 'rb') as f: engine = runtime.deserialize_cuda_engine(f.read()) return engine path="resnet_engine" save_engine(serialized_engine,path) engine=load_engine(path) context=engine.create_execution_context()
import ctypes import numpy as np from PIL import Image def prerocess_image(image_path,input_shape): image=Image.open(image_path) image=image.resize(input_shape[2:]) image_array=np.array(image,dtype=np.float32)
image_array=image_array/255.0
image_array = np.expand_dims(image_array, axis=0)
image_array=image_array.transpose((0, 3, 1, 2))
return image_array
import numpy as np import torch input_shape = (1, 3, 224, 224) output_shape = (1, 1000) dtype = np.float32
#input_data = np.random.rand(*input_shape).astype(dtype) #input_data = np.ascontiguousarray(input_data) image_path="/kaggle/input/test-image-3/8cec3814fbe3524455caf2ebb6183bf4.jpeg" input_shape=(1,3,224,224) input_data=prerocess_image(image_path,input_shape) input_data=np.ascontiguousarray(input_data)
output_data = np.empty(output_shape, dtype=dtype) output_data = np.ascontiguousarray(output_data)
import pycuda.driver as cuda import pycuda.autoinit
stream = cuda.Stream() import pycuda.driver as cuda import pycuda.autoinit
d_input = cuda.mem_alloc(input_data.nbytes) d_output = cuda.mem_alloc(output_data.nbytes)
if d_input is None or d_output is None: raise RuntimeError("CUDA内存分配失败")
stream = cuda.Stream()
if stream is None: raise RuntimeError("PyCUDA流对象创建失败")
cuda.memcpy_htod_async(d_input, input_data, stream)
context.set_tensor_address('input', int(d_input)) context.set_tensor_address('output', int(d_output))
bindings = [int(d_input), int(d_output)]
context.execute_v2(bindings)
cuda.memcpy_dtoh(output_data,d_output)
print(output_data)
@jiangchengchengark did you solve the issue? I'm getting a similar error...