TensorRT
Error when creating TRT engine with INT8 calibration
Hi! I am trying to implement a script that generates a TRT engine with INT8 calibration as an option. I'm basing my code on YOLOv8's engine export file (I tested that code and it works fine: https://github.com/ultralytics/ultralytics/blob/main/ultralytics/engine/exporter.py#L675)
Since I wanted to run it on other models, I wrote the following script after removing all of YOLOv8's cross-file dependencies:
from __future__ import print_function
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import torch
from base_opts import Opts
from pathlib import Path
import json
import sys, os
from utils import colorstr
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import glob
sys.path.insert(1, os.path.join(sys.path[0], ".."))
# Custom dataset class for image loading and preprocessing
class ImageFolderDataset(Dataset):
    def __init__(self, folder_path, img_size):
        self.img_paths = glob.glob(os.path.join(folder_path, '*'))
        self.transform = transforms.Compose([
            transforms.Resize(img_size),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        img = Image.open(img_path).convert('RGB')
        img = self.transform(img)
        return img
# Function to create a DataLoader for INT8 calibration
def get_int8_calibration_dataloader(folder_path, img_size=(640, 640), batch_size=16, prefix=colorstr("TensorRT:")):
    """Build and return a dataloader suitable for calibration of INT8 models."""
    print(f"{prefix} collecting INT8 calibration images from 'data={folder_path}'")
    dataset = ImageFolderDataset(folder_path, img_size)
    if len(dataset) < 300:
        print(f"WARNING ⚠️ >300 images recommended for INT8 calibration, found {len(dataset)} images.")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    return dataloader
def export_engine(f_onnx, savedir, is_half, is_int8, data, imgsz, dynamic, batch, n_workspace, verbose, prefix=colorstr("TensorRT:")):
    """YOLOv8 TensorRT export https://developer.nvidia.com/tensorrt."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    assert device != "cpu", "export running on CPU but must be on GPU, i.e. use 'device=0'"
    # check_version(trt.__version__, "7.0.0", hard=True)  # require tensorrt>=7.0.0

    # Setup and checks
    print(f"\n{prefix} starting export with TensorRT {trt.__version__}...")
    is_trt10 = int(trt.__version__.split(".")[0]) >= 10  # is TensorRT >= 10
    assert Path(f_onnx).exists(), f"ONNX file not found: {f_onnx}"
    basename = f_onnx.split('/')[-1]
    f = os.path.join(savedir, basename.replace('.onnx', '.engine'))  # TensorRT engine file
    logger = trt.Logger(trt.Logger.VERBOSE if verbose else trt.Logger.INFO)

    # Engine builder
    builder = trt.Builder(logger)
    config = builder.create_builder_config()
    workspace = int(n_workspace * (1 << 30))
    if is_trt10:
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
    else:  # TensorRT versions 7, 8
        config.max_workspace_size = workspace
    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(flag)
    half = builder.platform_has_fast_fp16 and is_half
    int8 = builder.platform_has_fast_int8 and is_int8

    # Read ONNX file
    parser = trt.OnnxParser(network, logger)
    if not parser.parse_from_file(f_onnx):
        raise RuntimeError(f"failed to load ONNX file: {f_onnx}")

    # Network inputs
    inputs = [network.get_input(i) for i in range(network.num_inputs)]
    outputs = [network.get_output(i) for i in range(network.num_outputs)]
    for inp in inputs:
        print(f'{prefix} input "{inp.name}" with shape{inp.shape} {inp.dtype}')
    for out in outputs:
        print(f'{prefix} output "{out.name}" with shape{out.shape} {out.dtype}')

    if dynamic:
        shape = torch.zeros(batch, 3, *imgsz).shape
        if shape[0] <= 1:
            print(f"{prefix} WARNING ⚠️ 'dynamic=True' model requires max batch size, i.e. 'batch=16'")
        profile = builder.create_optimization_profile()
        min_shape = (1, 3, *imgsz)  # minimum input shape
        max_shape = (64, 3, *imgsz)  # max input shape
        for inp in inputs:
            profile.set_shape(inp.name, min=shape, opt=shape, max=shape)
        config.add_optimization_profile(profile)

    print(f"{prefix} building {'INT8' if int8 else 'FP' + ('16' if half else '32')} engine as {f}")
    if int8:
        config.set_flag(trt.BuilderFlag.INT8)
        config.set_calibration_profile(profile)
        config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED

        class EngineCalibrator(trt.IInt8Calibrator):
            def __init__(self, dataloader, batch_size, cache_file=""):
                trt.IInt8Calibrator.__init__(self)
                self.dataloader = dataloader
                self.data_iter = iter(self.dataloader)
                self.batch_size = batch_size
                self.cache_file = cache_file

            def get_algorithm(self):
                return trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2

            def get_batch_size(self):
                return self.batch_size

            def get_batch(self, names):
                try:
                    batch = next(self.data_iter)
                    batch = batch.to('cuda')
                    return [int(batch.data_ptr())]
                except StopIteration:
                    return None

            def read_calibration_cache(self):
                if os.path.exists(self.cache_file):
                    with open(self.cache_file, 'rb') as f:
                        return f.read()
                return None

            def write_calibration_cache(self, cache):
                with open(self.cache_file, 'wb') as f:
                    f.write(cache)

        # Load dataset w/ builder (for batching) and calibrate
        config.int8_calibrator = EngineCalibrator(
            dataloader=get_int8_calibration_dataloader(data, imgsz, 2 * batch),
            batch_size=2 * batch,
            cache_file=str(f_onnx.replace('.onnx', '.cache')),
        )
    elif half:
        config.set_flag(trt.BuilderFlag.FP16)

    # Free CUDA memory
    torch.cuda.empty_cache()

    # Write file
    build = builder.build_serialized_network if is_trt10 else builder.build_engine
    with build(network, config) as engine, open(f, "wb") as t:
        # Model
        t.write(engine if is_trt10 else engine.serialize())
    return f, None
def main():
    """Create a TensorRT engine for ONNX-based YOLOv8 and run inference."""
    opt = Opts().parse()
    export_engine(
        f_onnx=opt.f_onnx,
        savedir=opt.savedir,
        is_half=opt.fp16,
        is_int8=opt.int8,
        data=opt.data,
        imgsz=tuple(opt.imgsz),
        dynamic=opt.dynamic,
        batch=opt.batch,
        n_workspace=opt.workspace,
        verbose=opt.verbose,
    )


if __name__ == '__main__':
    main()
When I run this code, exporting an FP32 or FP16 engine works fine, but the INT8 option fails with the following error:
root@618fe51132ed:/workspace# python3 quantization/mainv2.py -imgsz 640 640 -int8
TensorRT: starting export with TensorRT 8.6.1...
[06/06/2024-14:41:42] [TRT] [I] [MemUsageChange] Init CUDA: CPU +2, GPU +0, now: CPU 114, GPU 139 (MiB)
[06/06/2024-14:41:45] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1445, GPU +268, now: CPU 1635, GPU 407 (MiB)
/workspace/quantization/mainv2.py:98: DeprecationWarning: Use set_memory_pool_limit instead.
config.max_workspace_size = workspace
[06/06/2024-14:41:45] [TRT] [I] ----------------------------------------------------------------
[06/06/2024-14:41:45] [TRT] [I] Input filename: /models/model.onnx
[06/06/2024-14:41:45] [TRT] [I] ONNX IR version: 0.0.8
[06/06/2024-14:41:45] [TRT] [I] Opset version: 17
[06/06/2024-14:41:45] [TRT] [I] Producer name: pytorch
[06/06/2024-14:41:45] [TRT] [I] Producer version: 2.3.0
[06/06/2024-14:41:45] [TRT] [I] Domain:
[06/06/2024-14:41:45] [TRT] [I] Model version: 0
[06/06/2024-14:41:45] [TRT] [I] Doc string:
[06/06/2024-14:41:45] [TRT] [I] ----------------------------------------------------------------
[06/06/2024-14:41:45] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
TensorRT: input "images" with shape(-1, 3, -1, -1) DataType.FLOAT
TensorRT: output "output0" with shape(-1, 84, -1) DataType.FLOAT
TensorRT: building INT8 engine as /output/model.engine
TensorRT: collecting INT8 calibration images from 'data=/images'
/workspace/quantization/mainv2.py:186: DeprecationWarning: Use build_serialized_network instead.
with build(network, config) as engine, open(f, "wb") as t:
[06/06/2024-14:41:45] [TRT] [I] Graph optimization time: 0.0022734 seconds.
[06/06/2024-14:41:45] [TRT] [I] Timing cache disabled. Turning it on will improve builder speed.
[06/06/2024-14:41:46] [TRT] [I] Detected 1 inputs and 3 output network tensors.
[06/06/2024-14:41:46] [TRT] [I] Total Host Persistent Memory: 334496
[06/06/2024-14:41:46] [TRT] [I] Total Device Persistent Memory: 1645056
[06/06/2024-14:41:46] [TRT] [I] Total Scratch Memory: 0
[06/06/2024-14:41:46] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 1 MiB, GPU 150 MiB
[06/06/2024-14:41:46] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 308 steps to complete.
[06/06/2024-14:41:46] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 57.1936ms to assign 28 blocks to 308 nodes requiring 206504448 bytes.
[06/06/2024-14:41:46] [TRT] [I] Total Activation Memory: 206504448
[06/06/2024-14:41:46] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +199, now: CPU 0, GPU 213 (MiB)
[06/06/2024-14:41:46] [TRT] [I] Starting Calibration.
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [convBaseRunner.cpp::execute::295] Error Code 1: Cask (Cask convolution execution)
[06/06/2024-14:41:46] [TRT] [E] 1: [checkMacros.cpp::catchCudaError::203] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:47] [TRT] [E] 3: [engine.cpp::~Engine::298] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/engine.cpp::~Engine::298, condition: mExecutionContextCounter.use_count() == 1. Destroying an engine object before destroying the IExecutionContext objects it created leads to undefined behavior.
)
[06/06/2024-14:41:47] [TRT] [E] 2: [calibrator.cpp::calibrateEngine::1181] Error Code 2: Internal Error (Assertion context->executeV2(&bindings[0]) failed. )
Traceback (most recent call last):
File "/workspace/quantization/mainv2.py", line 211, in <module>
main()
File "/workspace/quantization/mainv2.py", line 197, in main
export_engine(
File "/workspace/quantization/mainv2.py", line 186, in export_engine
with build(network, config) as engine, open(f, "wb") as t:
AttributeError: __enter__
The only thing I changed is how YOLOv8 creates its dataloader, since I went for a basic one. I don't know if that's where the error comes from, and I can't find a solution. Any help? Thank you.
When I run this code, exporting an FP32 or FP16 engine works fine, but the INT8 option fails with the following error:
You should check whether the calibration data path/file is correct. You can try using torch.randn to generate calibration data.
Otherwise, if it is not a file/path issue, it may be a CUDA context issue. We usually do not use torch and pycuda simultaneously.
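For the first suggestion, a minimal sketch of what that could look like (the generator name, batch count, and shapes are assumptions and must match your ONNX input) is to swap the image dataloader for random batches and see whether calibration still fails:

```python
import torch

def get_random_calibration_batches(n_batches=10, batch_size=8, imgsz=(640, 640)):
    """Yield random FP32 batches shaped like the network input, e.g. (8, 3, 640, 640)."""
    for _ in range(n_batches):
        yield torch.randn(batch_size, 3, *imgsz, dtype=torch.float32)
```

If you pass this generator to the calibrator in place of get_int8_calibration_dataloader(...) and the same CUDA error still appears, the image folder is not the cause.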
When I run this code, exporting an FP32 or FP16 engine works fine, but the INT8 option fails with the following error:
You should check whether the calibration data path/file is correct. You can try using torch.randn to generate calibration data.
Otherwise, if it is not a file/path issue, it may be a CUDA context issue. We usually do not use torch and pycuda simultaneously.
I had the same problem and followed your advice not to use pycuda and torch together the way the YOLO implementation does, so I read the data as a numpy array instead. This is my implementation of get_batch, where my batch_size = 1:
def get_batch(self, names):
    """get batch of the calibration"""
    try:
        input_np = next(self.data_iter)
        # byte data of input vector has to be on cuda (gpu/jetson orin)
        cuda.memcpy_htod(self.device_input, input_np.data)
        return [int(self.device_input)]
    except StopIteration:
        return None  # if no more data in dataset return None
To initialize self.device_input, I used this line in __init__:
# reserve enough bytes on gpu/jetson orin to copy data on it for calibration
self.device_input = cuda.mem_alloc(self.dataset[0].nbytes * self.batch)
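Putting those two pieces together, a self-contained sketch of an entropy calibrator that stays entirely on numpy/pycuda (no torch on the calibration path) could look like the following; the class name, the way `batches` is prepared, and the cache path are placeholders, not part of the original script:

```python
import os
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context used by mem_alloc/memcpy_htod
import tensorrt as trt

class NumpyEntropyCalibrator(trt.IInt8EntropyCalibrator2):
    """INT8 entropy calibrator fed from preprocessed numpy batches (no torch involved)."""

    def __init__(self, batches, cache_file="calib.cache"):
        super().__init__()
        # `batches` is assumed to be a list of np.float32 arrays of shape (N, 3, H, W),
        # preprocessed exactly like the data the model will see at inference time.
        self.batch_size = batches[0].shape[0]
        self.cache_file = cache_file
        self.data_iter = iter(batches)
        # One device buffer, reused for every batch.
        self.device_input = cuda.mem_alloc(batches[0].nbytes)

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        try:
            batch = np.ascontiguousarray(next(self.data_iter), dtype=np.float32)
            cuda.memcpy_htod(self.device_input, batch)  # host -> device copy
            return [int(self.device_input)]
        except StopIteration:
            return None  # no more batches: TensorRT stops calibrating

    def read_calibration_cache(self):
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            f.write(cache)
```

Assigning an instance to config.int8_calibrator replaces the torch-based EngineCalibrator; since it derives from IInt8EntropyCalibrator2, no get_algorithm() override is needed.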
@labderrafie are you still encountering this issue with TRT 10?
We recommend using ModelOpt for model quantization now: https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/examples/onnx_ptq/README.md
Closing due to inactivity. Please feel free to reopen!