TensorRT
TensorRT copied to clipboard
BERT inference with TensorRT is slower than with ONNX Runtime
When I run BERT inference with TensorRT, it is slower than ONNX Runtime: TensorRT takes about 10 ms and ONNX Runtime about 6 ms. The model is just a simple BERT classification model. Could someone help me? ONNX Runtime code:
import numpy as np
import onnxruntime as ort
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification
from fastapi import FastAPI, Request
import uvicorn
import time
# HTTP application that exposes the ONNX Runtime classifier.
app = FastAPI()

# Label names, looked up by the class index the model predicts.
label_list = [
    "finance",
    "realty",
    "stocks",
    "education",
    "science",
    "society",
    "politics",
    "sports",
    "game",
    "entertainment",
]

# Directory that holds the tokenizer files, the model config and model.onnx.
pretrained_bert_dir = "/var/log/model_repository/bert_classification_v1/1/"

tokenizer = BertTokenizer.from_pretrained(pretrained_bert_dir)
bert_config = BertConfig.from_pretrained(
    pretrained_bert_dir,
    num_labels=len(label_list),
)

# Inference session restricted to the CUDA execution provider.
sess = ort.InferenceSession(
    f"{pretrained_bert_dir}model.onnx",
    providers=["CUDAExecutionProvider"],
)
@app.get("/predictSingle")
def query(q):
    """Classify one sentence with the ONNX Runtime session and report timings.

    Args:
        q: Text to classify.

    Returns:
        A one-element set containing a string with the predicted label and
        the per-stage timings (tokenization, inference, total) in seconds.
    """
    costs = []
    # perf_counter is monotonic, so the intervals cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    started = time.perf_counter()

    stage_started = time.perf_counter()
    inputs = tokenizer(
        q,
        max_length=32,
        padding="max_length",
        truncation="longest_first",
        return_tensors="pt",
    )
    costs.append(f"token={time.perf_counter() - stage_started}")

    stage_started = time.perf_counter()
    input_dict = {
        name: inputs[name].numpy()
        for name in ("input_ids", "token_type_ids", "attention_mask")
    }
    outs = sess.run(None, input_dict)
    # Take the argmax over the single row of the first output only, instead
    # of over the whole list of outputs flattened together.
    num = int(np.argmax(outs[0][0]))
    # The key stays "trt" so the report has the same shape as the TensorRT
    # service's report.
    costs.append(f"trt={time.perf_counter() - stage_started}")
    costs.append(f"all={time.perf_counter() - started}")
    return {f"predictions={label_list[num]} cost={costs}"}
if __name__ == '__main__':
    import argparse

    # Only -p/--port is recognised; any other command-line argument is ignored.
    cli = argparse.ArgumentParser()
    cli.add_argument('-p', '--port', default="8582", help='port')
    known_args, _unknown = cli.parse_known_args()
    port = known_args.port
    print(port)
    uvicorn.run(app, host='0.0.0.0', port=int(port), workers=1)
TensorRT code:
from fastapi import FastAPI, HTTPException
import tensorrt as trt
from transformers import BertTokenizer
import torch
import numpy as np
import uvicorn
import pycuda.driver as cuda
import collections
import time
import pycuda.autoinit
from flask import Flask, request
import os
import ctypes
# Flask application that serves the TensorRT-backed classifier.
# (The FastAPI variant of this script is disabled.)
app = Flask(__name__)

# Load the TensorRT plugin library with its symbols exported globally.
handle = ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
if not handle:
    raise RuntimeError("Could not load plugin library.")

# One tokenized batch together with the number of sentences it contains.
_Feature = collections.namedtuple(  # pylint: disable=invalid-name
    "Feature",
    [
        "input_ids",
        "attention_mask",
        "token_type_ids",
        "batch_size",
    ],
)

# Sequence length used when sizing the engine's input bindings.
max_seq_length = 32
class TRTModule(torch.nn.Module):
    """Run a deserialized TensorRT engine on tokenized classification batches.

    The code assumes the engine exposes three int32 inputs named
    ``input_ids``, ``token_type_ids`` and ``attention_mask`` on bindings 0-2
    and a single float32 output on binding 3.
    """

    def __init__(self, engine=None, input_names=None, output_names=None):
        """Store the engine and create an execution context for it.

        Args:
            engine: Deserialized TensorRT engine, or ``None`` to build the
                module without one.
            input_names: Names of the input bindings (stored, not used here).
            output_names: Names of the output bindings (stored, not used here).
        """
        super().__init__()
        self.engine = engine
        # Always define the attribute so that a module built without an
        # engine fails with an explicit error in forward().
        self.context = None
        if engine is not None:
            self.context = engine.create_execution_context()
        self.input_names = input_names
        self.output_names = output_names

    def forward(self, features):
        """Classify every feature batch and return the predicted label names.

        Args:
            features: Sequence of ``_Feature``-like records providing
                ``input_ids``, ``token_type_ids``, ``attention_mask`` and
                ``batch_size``. The first record's batch size is used to size
                the device buffers, so all records are assumed to share it.

        Returns:
            A flat list with one label string per row, in input order. An
            empty ``features`` sequence yields an empty list.

        Raises:
            RuntimeError: If the module was constructed without an engine.
        """
        outputs = []
        if not features:
            return outputs
        if self.context is None:
            raise RuntimeError("TRTModule was constructed without an engine.")

        batch_size = features[0].batch_size
        print(f"batch_size={batch_size}")
        input_shape = (batch_size, max_seq_length)
        input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
        d_inputs = [cuda.mem_alloc(input_nbytes) for _ in range(3)]
        stream = cuda.Stream()
        for binding in range(3):
            self.context.set_binding_shape(binding, input_shape)
        assert self.context.all_binding_shapes_specified

        # The output size is queried from the context because it may differ
        # between input shapes.
        h_output = cuda.pagelocked_empty(tuple(self.context.get_binding_shape(3)), dtype=np.float32)
        d_output = cuda.mem_alloc(h_output.nbytes)

        # Loop-invariant values: resolve the binding slots from this module's
        # own engine (not a module-level global) and build the pointer list once.
        ids_slot = self.engine.get_binding_index("input_ids")
        segment_slot = self.engine.get_binding_index("token_type_ids")
        mask_slot = self.engine.get_binding_index("attention_mask")
        bindings = [int(d_inp) for d_inp in d_inputs] + [int(d_output)]

        for feature in features:
            # Register the host arrays before the asynchronous copies.
            input_ids = cuda.register_host_memory(np.ascontiguousarray(feature.input_ids.ravel()))
            segment_ids = cuda.register_host_memory(np.ascontiguousarray(feature.token_type_ids.ravel()))
            input_mask = cuda.register_host_memory(np.ascontiguousarray(feature.attention_mask.ravel()))
            try:
                cuda.memcpy_htod_async(d_inputs[ids_slot], input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[segment_slot], segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[mask_slot], input_mask, stream)
                # Run inference.
                self.context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
                # Copy the predictions back; everything is queued on the same
                # stream, so one synchronize after the last copy is sufficient.
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()
                for row in h_output:
                    # Post-processing: pick the highest-scoring class per row.
                    predicted_classes = np.argmax(row, axis=0)
                    print(predicted_classes)
                    print({f"predictions={label_list[predicted_classes]}"})
                    outputs.append(label_list[predicted_classes])
            finally:
                # Drop the references to the registered host arrays.
                del input_ids
                del segment_ids
                del input_mask
        return outputs
# Label names, looked up by predicted class index.
label_list = [
    "finance",
    "realty",
    "stocks",
    "education",
    "science",
    "society",
    "politics",
    "sports",
    "game",
    "entertainment",
]

# Model repository directory holding model.plan and the tokenizer files.
# NOTE: this module-level name shadows the ``dir`` builtin.
dir = "/var/log/model_repository/bert_classification_tensorrt_843/1/"
engine_file_path = f"{dir}model.plan"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logger = trt.Logger(trt.Logger.INFO)
model_all_names = []

# Read the serialized plan from disk and deserialize it into an engine.
with open(engine_file_path, "rb") as f:
    with trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

trt_model = TRTModule(
    engine,
    ["input_ids", "attention_mask", "token_type_ids"],
    ['logits'],
).to(device)
tokenizer = BertTokenizer.from_pretrained(dir)
@app.get('/predictSingle')
def predict():
    """Classify two fixed sample sentences with the TensorRT module.

    Returns:
        A string with the predicted labels and the per-stage timings
        (tokenization, inference, total) in seconds.

    Raises:
        HTTPException: With status 500 and the error text when any step fails.
    """
    try:
        costs = []
        # perf_counter is monotonic, so the intervals cannot be skewed by
        # wall-clock adjustments the way time.time() differences can.
        started = time.perf_counter()

        stage_started = time.perf_counter()
        sentences = ["股票情况", "大学教授"]
        data_org = tokenizer(
            sentences,
            max_length=32,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        costs.append(f"token={(time.perf_counter() - stage_started):.3f}")

        stage_started = time.perf_counter()
        # Convert to int32. Tensor.to() is the supported way to change the
        # dtype of an existing tensor; torch.tensor(tensor) warns about
        # copy-construction.
        data = {k: v.to(torch.int32) for k, v in data_org.items()}
        features = [
            _Feature(
                input_ids=data['input_ids'],
                attention_mask=data['attention_mask'],
                token_type_ids=data['token_type_ids'],
                batch_size=len(sentences),
            )
        ]
        outputs = trt_model(features)
        costs.append(f"trt={(time.perf_counter() - stage_started):.3f}")
        costs.append(f"all={(time.perf_counter() - started):.3f}")
        return f"predictions={outputs} costs={costs}"
    except Exception as e:
        # NOTE(review): HTTPException is imported from FastAPI while ``app`` is
        # a Flask application -- confirm Flask turns this into the intended
        # response. The original cause is chained so it shows in the traceback.
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == '__main__':
    import argparse

    # Only -p/--port is recognised; any other command-line argument is ignored.
    cli = argparse.ArgumentParser()
    cli.add_argument('-p', '--port', default="8580", help='port')
    known_args, _unknown = cli.parse_known_args()
    port = known_args.port
    print(port)
    # Served by Flask's built-in server, single-threaded; the uvicorn launch
    # used by the FastAPI variant is disabled.
    app.run(host='0.0.0.0', port=int(port), threaded=False)
How many iterations are you running? The first few iterations take longer because of GPU warm-up and initialization. I would highly recommend using our trtexec tool to measure the performance.
Closing since there has been no activity for more than 3 weeks. Please reopen if you still have questions, thanks!
Hi, can sentence-transformers models, e.g. https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2, already be used with TensorRT-LLM? My goal is to compile the sentence-transformers/all-MiniLM-L6-v2 model without quantization using TensorRT-LLM and serve it with Triton. Are there any docs on how to make the model ready for TensorRT as well as ONNX? cc @ttyio @zerollzeng