Wrong results caused by using TensorrtExecutionProvider
Environment: tensorrt 8.6.1, optimum 1.14.1, onnx 1.15.0, onnx-graphsurgeon 0.3.12, onnxruntime-gpu 1.16.2
complete code:
# Reproduction script: run a Nougat-LaTeX ORT VisionEncoderDecoder model with
# the TensorrtExecutionProvider and decode one image to LaTeX, timing each step.
import tensorrt
print(tensorrt.__version__)
import argparse
import os
import time
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, DonutSwinModel, AutoConfig, pipeline
from transformers.models.nougat import NougatTokenizerFast
from nougat_latex.util import process_raw_latex_code
from nougat_latex import NougatLaTexProcessor
from optimum.onnxruntime import ORTModelForVision2Seq, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig
from transformers import AutoTokenizer

device = torch.device("cuda:0")

# Cache compiled TensorRT engines on disk so they are not rebuilt every run.
os.makedirs("./examples/tensorrt2", exist_ok=True)
provider_options = {
    "trt_engine_cache_enable": True,
    "trt_engine_cache_path": "./examples/tensorrt2",
}

t0 = time.time()
# ort_model = ORTModelForVision2Seq.from_pretrained('./examples/onnx2').to(device)
ort_model = ORTModelForVision2Seq.from_pretrained(
    "./examples/onnx2",
    use_cache=False,  # decoder exported/loaded WITHOUT KV-cache inputs
    provider="TensorrtExecutionProvider",
    provider_options=provider_options,
)
t1 = time.time()
print("load model1 time = ", (t1 - t0))

t0 = time.time()
tokenizer = NougatTokenizerFast.from_pretrained(r'./examples/onnx2')
latex_processor = NougatLaTexProcessor.from_pretrained(r'./examples/onnx2')
t1 = time.time()
print("load model2 time = ", (t1 - t0))

t0 = time.time()
print("Building engine for a short sequence...")
image = Image.open('./test/0201099.png')
if not image.mode == "RGB":
    image = image.convert('RGB')
pixel_values = latex_processor(image, return_tensors="pt").pixel_values
task_prompt = tokenizer.bos_token
decoder_input_ids = tokenizer(task_prompt, add_special_tokens=False,
                              return_tensors="pt").input_ids
t1 = time.time()
print("load model3 time = ", (t1 - t0))

t0 = time.time()
# output = ort_model.generate(**pixel_values)
outputs = ort_model.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=800,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # FIX: the model was loaded with use_cache=False, so the decoder session has
    # no past-key-values inputs; requesting use_cache=True here was inconsistent
    # with the loaded graph. Keep generation cache-less to match the export.
    use_cache=False,
    num_beams=1,
    # NOTE: early_stopping=True was dropped — it only applies to beam search
    # (num_beams > 1) and merely triggered the UserWarning seen in the logs.
    bad_words_ids=[[tokenizer.unk_token_id]],
    return_dict_in_generate=True,
)
t1 = time.time()
print("load model4 time = ", (t1 - t0))

t0 = time.time()
print(outputs)
# Decode, then strip the special tokens (EOS/PAD/BOS) from the raw sequence.
sequence = tokenizer.batch_decode(outputs.sequences)[0]
sequence = sequence.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "").replace(tokenizer.bos_token,"")
sequence = process_raw_latex_code(sequence)
t1 = time.time()
print("load model5 time = ", (t1 - t0))
print(sequence)
- First question: when I use
ort_model = ORTModelForVision2Seq.from_pretrained('./examples/onnx2').to(device)
The output is:
load model1 time = 21.829745054244995 load model2 time = 0.11319541931152344 Building engine for a short sequence... load model3 time = 0.12468218803405762 /home/kas/.conda/envs/torch/lib/python3.8/site-packages/transformers/generation/utils.py:1473: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration ) warnings.warn( /home/kas/.conda/envs/torch/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:418: UserWarning:
num_beamsis set to 1. However,early_stoppingis set toTrue-- this flag is only used in beam-based generation modes. You should setnum_beams>1or unsetearly_stopping. warnings.warn( load model4 time = 4.698873281478882 GreedySearchEncoderDecoderOutput(sequences=tensor([[ 0, 90, 763, 84, 113, 40, 115, 33, 90, 627, 84, 113, 40, 115, 82, 4782, 309, 340, 84, 113, 40, 115, 2]], device='cuda:0'), scores=None, encoder_attentions=None, encoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, decoder_hidden_states=None) load model5 time = 0.0027070045471191406 d x^{2}+d y^{2}\neq d l^{2}
The result is correct. But when I use
ort_model = ORTModelForVision2Seq.from_pretrained(
"./examples/onnx2",
use_cache=False,
provider="TensorrtExecutionProvider",
provider_options=provider_options,
)
only the first character of the prediction is output:
load model1 time = 33.12913107872009 load model2 time = 0.08635807037353516 Building engine for a short sequence... load model3 time = 0.11843752861022949 /home/kas/.conda/envs/torch/lib/python3.8/site-packages/transformers/generation/utils.py:1473: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration ) warnings.warn( /home/kas/.conda/envs/torch/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:418: UserWarning:
num_beamsis set to 1. However,early_stoppingis set toTrue-- this flag is only used in beam-based generation modes. You should setnum_beams>1or unsetearly_stopping. warnings.warn( load model4 time = 4.253650903701782 GreedySearchEncoderDecoderOutput(sequences=tensor([[ 0, 90, 2]], device='cuda:0'), scores=None, encoder_attentions=None, encoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, decoder_hidden_states=None) load model5 time = 0.002318143844604492 d
- Another problem: for some images, model.generate emits a large number of warnings and takes a long time, e.g.:
WARNING] BuilderFlag::kENABLE_TACTIC_HEURISTIC has been ignored in this builder run. This feature is only supported on Ampere and beyond.