The output of the ONNX model is different from the output of the model inferenced with TensorRT
Description
I compiled a Hugging Face model (model link: https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5, which includes both the model architecture code and the model files) with TensorRT (TRT) to improve inference speed. The steps I followed were hf -> onnx -> trt.
I ran inference on the same image with Hugging Face (hf), ONNX Runtime, and the TRT engine. The hf and ONNX results are consistent, but the TRT engine's result differs from the other two.
I would like to know why the ONNX results are correct while the results from the engine built with trtexec are wrong. Why is this happening?
The conversion code from hf to ONNX is:
import torch
from transformers import AutoModel
from typing import List, Dict
import time
import os
import warnings


class InternVL2VisionWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pixel_values):
        features = self.model(pixel_values).last_hidden_state
        return features


model_path = "/media/star/disk2/pretrained_model/InternViT/InternViT-6B-448px-V1-5"
output_dir = "/media/star/8T/tmp/InternViT-6B-448px-V1-5/onnx"
os.makedirs(f'{output_dir}', exist_ok=True)

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval()

device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
wrapper = InternVL2VisionWrapper(model=model.to(device))
dummy_image = torch.empty(1,
                          3,
                          448,
                          448,
                          dtype=torch.float16,
                          device=device)  # dummy image


def export_visual_wrapper_onnx(visual_wrapper,
                               input,
                               output_dir,
                               input_names=['input'],
                               dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}}):
    torch.onnx.export(visual_wrapper,
                      input,
                      f'{output_dir}/visual_encoder.onnx',
                      opset_version=17,
                      input_names=input_names,
                      output_names=['output'],
                      dynamic_axes=dynamic_axes)


export_visual_wrapper_onnx(wrapper, dummy_image, output_dir)
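(Not part of the original script) As a small sanity check after the export, one can inspect the exported graph's input/output dtypes, since an unintended fp32/fp16 mismatch at the graph boundary is a common source of confusion. A minimal sketch, assuming the export above succeeded:
import onnx

# load_external_data=False avoids pulling the multi-GB external weights into memory
onnx_path = f'{output_dir}/visual_encoder.onnx'
graph = onnx.load(onnx_path, load_external_data=False).graph
for tensor in list(graph.input) + list(graph.output):
    elem_type = tensor.type.tensor_type.elem_type
    # for this fp16 export one would expect FLOAT16 here
    print(tensor.name, onnx.TensorProto.DataType.Name(elem_type))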
The conversion code from ONNX to TRT engine is:
MODEL_NAME="InternVL2-40B"
OUTPUT_MODEL_NAME="InternVL2_40B"
onnx_process_version="onnx_v1"
max_batch_size=24
onnx_dtype="float16"
trt_dtype="best"
/usr/src/tensorrt/bin/trtexec \
--onnx=/data/eas/visual_engine/a100/InternViT-6B-448px-V1-5/onnx/visual_encoder.onnx \
--saveEngine=/data/eas/visual_engine/a100/InternViT-6B-448px-V1-5/visual_encoder.trtexec.${trt_dtype}.maxBatchSize${max_batch_size}.engine \
--minShapes=input:1x3x448x448 \
--optShapes=input:8x3x448x448 \
--maxShapes=input:24x3x448x448 \
--best
The inference code for hf is:
import torch
import time
from transformers import AutoModel, CLIPImageProcessor
import numpy as np

model_path = "/media/star/disk2/pretrained_model/InternViT/InternViT-6B-448px-V1-5"
torch_dtype = torch.float16

model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval()

pixel_values = torch.tensor(np.load("/media/star/8T/tmp/internvn2_40b_image2_patch1.npy")).to(torch_dtype).cuda()
print(f"pixel_values.shape={pixel_values.shape}")
pixel_values = torch.concat([pixel_values], dim=0)

start_time = time.time()
outputs = model(pixel_values)
print(f"run time is {time.time()-start_time} seconds")
print(f"outputs.last_hidden_state={outputs.last_hidden_state}")
print(f"outputs.last_hidden_state.shape={outputs.last_hidden_state.shape}")
The inference code for ONNX is:
import os
import torch
import numpy as np
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from PIL import Image
import time
import onnxruntime
from transformers import CLIPImageProcessor

if __name__ == '__main__':
    img_sess_options = onnxruntime.SessionOptions()
    img_run_options = onnxruntime.RunOptions()
    img_run_options.log_severity_level = 2
    img_onnx_model_path = "/media/star/8T/tmp/InternViT-6B-448px-V1-5/onnx/visual_encoder.onnx"
    pixel_values = torch.tensor(np.load("/media/star/8T/tmp/internvn2_40b_image2_patch1.npy")).to(torch.float16)
    print(f"pixel_values.shape={pixel_values.shape}")
    img_session = onnxruntime.InferenceSession(img_onnx_model_path,
                                               sess_options=img_sess_options,
                                               providers=["CUDAExecutionProvider"])
    onnx_image_features = img_session.run(["output"], {"input": pixel_values.cpu().numpy()})[0]
    onnx_image_features = torch.tensor(onnx_image_features).cuda()
    print(f"onnx_image_features.shape={onnx_image_features.shape}")
    print(f"onnx_image_features={onnx_image_features}")
The inference code for TRT engine is:
import tensorrt_llm
from tensorrt_llm import logger
from tensorrt_llm._utils import str_dtype_to_trt
from tensorrt_llm.runtime import Session, TensorInfo
import os
import torch
import tensorrt as trt
import numpy as np
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from PIL import Image
import time
from transformers import AutoModel, AutoTokenizer
import math


def trt_dtype_to_torch(dtype):
    if dtype == trt.float16:
        return torch.float16
    elif dtype == trt.float32:
        return torch.float32
    elif dtype == trt.int32:
        return torch.int32
    elif dtype == trt.bfloat16:
        return torch.bfloat16
    else:
        raise TypeError("%s is not supported" % dtype)


vision_precision = "float16"
if vision_precision == "float16":
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float

start_time = time.time()
image = torch.tensor(np.load("/data/eas/ndarray/internvn2_40b_image2_patch1.npy")).to(torch_dtype).cuda()
print(f"load image time is {time.time()-start_time} seconds")
# print(f"image pixels={image}")
print(f"origin image.shape={image.shape}")
image = torch.concat([image] * 1, dim=0)
print(f"new image.shape={image.shape}")
iter_num = 1
# =======================================================================================================================
attention_mask = None
stream = torch.cuda.Stream(torch.cuda.current_device())
torch.cuda.set_stream(stream)

vision_encoder_path = "/data/eas/visual_engine/a100/InternViT-6B-448px-V1-5/visual_encoder.trtexec.best.maxBatchSize24.engine"
logger.info(f'Loading engine from {vision_encoder_path}')
with open(vision_encoder_path, 'rb') as f:
    engine_buffer = f.read()
logger.info(f'Creating session from engine {vision_encoder_path}')
visual_encoder_session = Session.from_serialized_engine(engine_buffer)

visual_features = {'input': image.to(tensorrt_llm._utils.str_dtype_to_torch(vision_precision))}
if attention_mask is not None:
    visual_features['attention_mask'] = attention_mask

tensor_info = [TensorInfo('input', str_dtype_to_trt(vision_precision), image.shape)]
if attention_mask is not None:
    tensor_info.append(TensorInfo('attention_mask', trt.DataType.INT32, attention_mask.shape))

visual_output_info = visual_encoder_session.infer_shapes(tensor_info)
visual_outputs = {
    t.name: torch.empty(tuple(t.shape),
                        dtype=trt_dtype_to_torch(t.dtype),
                        device=image.device)
    for t in visual_output_info
}

start_time = time.time()
for _ in range(iter_num):
    ok = visual_encoder_session.run(visual_features, visual_outputs, stream.cuda_stream)
    assert ok, "Runtime execution failed for vision encoder session"
    stream.synchronize()
print(f"tensorrt, single image run time is {round((time.time()-start_time)/(iter_num),2)} seconds")

outputs_trt = visual_outputs['output']
outputs_trt = torch.squeeze(outputs_trt)
print(f"outputs_trt.shape={outputs_trt.shape}")
print(f"outputs_trt={outputs_trt}")
The inference results are as follows:
InternViT-6B-448px-V1-5
hf
outputs.last_hidden_state=tensor([[[ 1.0576, -4.4062, 1.1816, ..., 0.4963, 0.5752, 0.4436],
[ 3.6680, 4.8086, 4.7578, ..., -14.2969, 6.4336, -12.0312],
[ 3.9355, 4.4805, 4.4922, ..., -14.7031, 6.2812, -11.2266],
...,
[ -2.5684, -2.8164, 5.3242, ..., -7.0508, 0.2556, -6.5859],
[ -6.5156, -6.5859, 9.9531, ..., -4.0938, -4.5703, -14.6719],
[ -6.2383, -6.2930, 10.0391, ..., -3.8965, -4.2891, -15.1016]]],
device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)
outputs.last_hidden_state.shape=torch.Size([1, 1025, 3200])
onnx
onnx_image_features=tensor([[[ 1.0508, -4.3672, 1.1572, ..., 0.4902, 0.5601, 0.4417],
[ 3.6680, 4.8086, 4.7539, ..., -14.2969, 6.4180, -12.0391],
[ 3.9316, 4.4805, 4.4805, ..., -14.6953, 6.2656, -11.2188],
...,
[ -2.5840, -2.8320, 5.3555, ..., -7.0664, 0.2539, -6.6055],
[ -6.5195, -6.5898, 9.9375, ..., -4.0977, -4.5781, -14.6641],
[ -6.2461, -6.2891, 10.0234, ..., -3.9062, -4.2930, -15.0938]]],
device='cuda:0', dtype=torch.float16)
trtexec
outputs_trt=tensor([[ 1.6309, -3.6855, 2.2578, ..., 0.0575, 1.1006, -0.1124],
[ 4.1562, 3.1484, 5.3594, ..., -10.8594, 6.0859, -10.5625],
[ 4.8984, 2.9824, 5.1719, ..., -11.0938, 6.1602, -9.5312],
...,
[ -0.2856, -3.4414, 5.4531, ..., -3.5000, 0.0735, -5.1602],
[ -3.1660, -5.7734, 8.8047, ..., -4.5742, -3.1387, -10.2969],
[ -3.0371, -5.5156, 8.8984, ..., -4.5469, -3.0234, -10.8906]],
device='cuda:0', dtype=torch.float16)
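For reference (not part of the original report), a minimal sketch to quantify the divergence, assuming the three outputs above are available as tensors named hf_out, onnx_out, and trt_out (placeholder names), all squeezed to the same [1025, 3200] shape:
import torch

def report_diff(name, a, b):
    # compare in fp32 to avoid further fp16 rounding in the metric itself
    a, b = a.float().flatten(), b.float().flatten()
    max_abs = (a - b).abs().max().item()
    cos = torch.nn.functional.cosine_similarity(a, b, dim=0).item()
    print(f"{name}: max_abs_diff={max_abs:.4f}, cosine_similarity={cos:.6f}")

report_diff("hf vs onnx", hf_out, onnx_out)
report_diff("hf vs trt", hf_out, trt_out)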
Environment
TensorRT Version: v100500
NVIDIA GPU: A100
NVIDIA Driver Version: 535.54.03
CUDA Version: 12.2
CUDNN Version: 8920
Operating System: Docker image nvidia_cuda_12.4.0-devel-ubuntu22.04
Python Version (if applicable): 3.10.12
Tensorflow Version (if applicable):
PyTorch Version (if applicable): 2.2.2+cu121
Baremetal or Container (if so, version): Docker
Relevant Files
Model link: https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5
internvn2_40b_image2_patch1.npy: internvn2_40b_image2_patch1.zip
ONNX file link: https://drive.google.com/file/d/1lnEmuQ4cNzf8YA7ddznqUnYsz-W5y5aJ/view?usp=sharing
@rajeevsrao @ttyio @pranavm-nvidia @aaronp24 @ilyasher Could you please take a look at this issue?
The problem is that trtexec will use random scaling factors for int8 mode. If you replace --best with --fp16 (i.e. disable --int8), that should improve the accuracy.
The problem is that trtexec will use random scaling factors for int8 mode. If you replace --best with --fp16 (i.e. disable --int8), that should improve the accuracy.
@pranavm-nvidia
Thanks for your reply.
I rebuilt the engine with the command below, but the TensorRT engine's inference results still differ from Hugging Face's. I configured FP16 and did not specify INT8, so INT8 should be disabled. Why are the results still different?
MODEL_NAME="InternVL2-40B"
OUTPUT_MODEL_NAME="InternVL2_40B"
onnx_process_version="onnx_v1"
max_batch_size=24
onnx_dtype="float16"
trt_dtype="fp16"
/usr/src/tensorrt/bin/trtexec \
--onnx=/data/eas/visual_engine/a100/InternViT-6B-448px-V1-5/onnx/visual_encoder.onnx \
--saveEngine=/data/eas/visual_engine/a100/InternViT-6B-448px-V1-5/visual_encoder.trtexec.${trt_dtype}.maxBatchSize${max_batch_size}.engine \
--minShapes=input:1x3x448x448 \
--optShapes=input:8x3x448x448 \
--maxShapes=input:24x3x448x448 \
--fp16
result
hf+float16
outputs.last_hidden_state=tensor([[[ 1.0576, -4.4062, 1.1816, ..., 0.4963, 0.5752, 0.4436],
[ 3.6680, 4.8086, 4.7578, ..., -14.2969, 6.4336, -12.0312],
[ 3.9355, 4.4805, 4.4922, ..., -14.7031, 6.2812, -11.2266],
...,
[ -2.5684, -2.8164, 5.3242, ..., -7.0508, 0.2556, -6.5859],
[ -6.5156, -6.5859, 9.9531, ..., -4.0938, -4.5703, -14.6719],
[ -6.2383, -6.2930, 10.0391, ..., -3.8965, -4.2891, -15.1016]]],
device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)
outputs.last_hidden_state.shape=torch.Size([1, 1025, 3200])
tensorrt fp16
outputs_trt.shape=torch.Size([1025, 3200])
outputs_trt=tensor([[ 1.6475, -3.5586, 2.3145, ..., 0.0755, 0.9883, -0.1611],
[ 4.3867, 3.2012, 5.6523, ..., -10.9531, 6.0000, -10.7500],
[ 5.1836, 3.0391, 5.4883, ..., -11.1641, 6.1055, -9.7188],
...,
[ -0.2261, -3.4922, 5.6211, ..., -4.0312, 0.1794, -5.6328],
[ -3.2559, -5.6719, 8.9219, ..., -4.8320, -3.1484, -11.0000],
[ -3.1094, -5.3867, 9.0156, ..., -4.7812, -3.0059, -11.5312]],
Same issue. You can set flash_attn to false and use bf16 to compile; it works for me.
Same issue. You can set flash_attn to false and use bf16 to compile; it works for me.
@seanxcwang
I followed the method you provided for testing. In the hf -> onnx step, I set use_flash_attn=False and loaded the model with torch.bfloat16.
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=False,
    trust_remote_code=True).cuda().eval()
In the onnx -> trt stage, I tried both --fp16 and --best settings, but the result was the same: the difference between TRT and ONNX inference results remains significant.
MODEL_NAME="InternVL2-40B"
OUTPUT_MODEL_NAME="InternVL2_40B"
onnx_process_version="onnx_v1"
max_batch_size=24
onnx_dtype="float16"
trt_dtype="best"
/usr/src/tensorrt/bin/trtexec \
--onnx=/data/eas/visual_engine/a100/InternViT-6B-448px-V1-5/bfloat16/onnx/visual_encoder.onnx \
--saveEngine=/data/eas/visual_engine/a100/InternViT-6B-448px-V1-5/bfloat16/visual_encoder.trtexec.${trt_dtype}.maxBatchSize${max_batch_size}.engine \
--minShapes=input:1x3x448x448 \
--optShapes=input:8x3x448x448 \
--maxShapes=input:24x3x448x448 \
--${trt_dtype}
Did you compile following these steps?
I found that bfloat16 is not required, but use_flash_attn must be set to false when exporting the ONNX model, and stronglyTyped should be added when converting to the TRT engine. By the way, I use the Python API to compile the model.
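For reference, a minimal sketch of what such a strongly typed build with the TensorRT Python API could look like (my reading of the suggestion above, not the exact script used here; paths are placeholders and the shapes follow the trtexec commands earlier in this thread):
import tensorrt as trt

onnx_path = "visual_encoder.onnx"      # placeholder
engine_path = "visual_encoder.engine"  # placeholder

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
# With STRONGLY_TYPED, tensor dtypes follow the ONNX graph instead of builder precision flags
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED))
parser = trt.OnnxParser(network, logger)
# parse_from_file resolves external weight files next to the ONNX model
if not parser.parse_from_file(onnx_path):
    for i in range(parser.num_errors):
        print(parser.get_error(i))
    raise RuntimeError("failed to parse ONNX model")

config = builder.create_builder_config()
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 448, 448), (8, 3, 448, 448), (24, 3, 448, 448))
config.add_optimization_profile(profile)

serialized = builder.build_serialized_network(network, config)
with open(engine_path, "wb") as f:
    f.write(serialized)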
@seanxcwang
I found that the following section of code in the Hugging Face model casts the computation to float32. Because of it, only when I export and build the TRT engine in float32 do the inference results from TRT and HF remain consistent; if fp16 or best is configured, the results diverge. However, float32 inference is quite slow, so I am still looking for a solution.
class InternRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
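Given that, one untested possibility (my own suggestion, not something verified in this thread) is to build with FP16 overall but pin the normalization-related layers to FP32 through per-layer precision constraints in the TensorRT Python API. A sketch, assuming a weakly typed network (built without the STRONGLY_TYPED flag) parsed from the same ONNX model:
import tensorrt as trt

def pin_norm_layers_to_fp32(network, config):
    # FP16 for the bulk of the network, but obey the per-layer constraints set below
    config.set_flag(trt.BuilderFlag.FP16)
    config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
    for i in range(network.num_layers):
        layer = network.get_layer(i)
        # InternRMSNorm lowers to reduce / normalization-style ops; pinning those to
        # FP32 mimics the float32 cast in the HF forward pass
        if layer.type in (trt.LayerType.REDUCE, trt.LayerType.NORMALIZATION):
            layer.precision = trt.float32
            for j in range(layer.num_outputs):
                layer.set_output_type(j, trt.float32)
Whether the ONNX-lowered layer types actually show up as REDUCE/NORMALIZATION, and whether this recovers both accuracy and acceptable speed, would need to be checked against the real graph.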