TorchScript inference slower than default torch model
I found that inference with the TorchScript model (converted by model_zoo_importer.py) is slower than with the default PyTorch format.
from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import time

pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512)
model.predict(pairs)
print("model prewarmed")

start_time = time.time()
for _ in range(10):
    model.predict(pairs)
end_time = time.time()
execution_time = end_time - start_time
print("execution_time1: ", execution_time)

tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
model1 = torch.load("bge-reranker-v2-m3.pt")
print("model1 prewarmed")

start_time = time.time()
for _ in range(10):
    with torch.no_grad():
        scores = model1(**inputs)['logits'].view(-1).float()
end_time = time.time()
execution_time = end_time - start_time
print("execution_time2: ", execution_time)
execution_time1: 0.4500548839569092
execution_time2: 2.066737174987793
On both 'cpu' and 'cuda', the .pt model takes 4-5 times as long as the source model.
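One thing worth noting about the timing loop above: on 'cuda', kernel launches are asynchronous, so stopping the clock with time.time() right after the loop can measure launch overhead rather than actual execution. A minimal sketch of a synchronized measurement, assuming the same .pt file and the inputs produced by the tokenizer call above:

device = 'cuda'
model1 = torch.jit.load("bge-reranker-v2-m3.pt", map_location=device)
model1.eval()
inputs_gpu = {k: v.to(device) for k, v in inputs.items()}  # 'inputs' from the tokenizer call above

# warmup, then wait for all queued kernels before starting the clock
with torch.no_grad():
    model1(**inputs_gpu)
torch.cuda.synchronize()

start_time = time.time()
with torch.no_grad():
    for _ in range(10):
        model1(**inputs_gpu)
torch.cuda.synchronize()  # make sure the queued work has actually finished
print("execution_time (synchronized): ", time.time() - start_time)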
I'm not able to reproduce your case. I used the following code:
import os.path
import time
import torch
from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def main():
    pairs = [['what is panda?', 'hi'],
             ['what is panda?',
              'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]
    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
    model.eval()
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    model_file = "bge-reranker-v2-m3.pt"
    if os.path.exists(model_file):
        traced_model = torch.jit.load(model_file)
        traced_model.eval()
    else:
        traced_model = torch.jit.trace(model, (input_ids, attention_mask), strict=False)
        traced_model.save(model_file)

    # warmup
    with torch.no_grad():
        traced_model(input_ids, attention_mask)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            traced_model(input_ids, attention_mask)
    execution_time = time.time() - start_time
    print("traced model: ", execution_time)

    # warmup
    with torch.no_grad():
        model(**inputs)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            model(**inputs)
    execution_time = time.time() - start_time
    print("huggingface model: ", execution_time)

    model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512)
    model.predict(pairs)

    start_time = time.time()
    for _ in range(10):
        model.predict(pairs)
    execution_time = time.time() - start_time
    print("CrossEncoder: ", execution_time)


if __name__ == '__main__':
    main()
traced model: 2.593564033508301
huggingface model: 2.4144911766052246
CrossEncoder: 2.5312209129333496
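For what it's worth, the gap in the original report may also shrink after a longer warmup: TorchScript's profiling executor typically re-optimizes the graph only after the first couple of invocations, and torch.jit.optimize_for_inference can freeze the module up front. A rough sketch, reusing the model file and the input_ids / attention_mask tensors from the script above:

traced_model = torch.jit.load("bge-reranker-v2-m3.pt")
traced_model.eval()

# freeze weights and apply inference-only graph optimizations
traced_model = torch.jit.optimize_for_inference(traced_model)

# several warmup passes so the profiling executor can specialize the graph
with torch.no_grad():
    for _ in range(3):
        traced_model(input_ids, attention_mask)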
@frankfliu I modified your script to run on CUDA, but got an error message:
import os.path
import time
import torch
from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def main():
    pairs = [['what is panda?', 'hi'],
             ['what is panda?',
              'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]
    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3').to('cuda')
    model.eval()
    input_ids = inputs["input_ids"].to('cuda')
    attention_mask = inputs["attention_mask"].to('cuda')
    inputs = inputs.to('cuda')

    model_file = "bge-reranker-v2-m3.pt"
    traced_model = torch.jit.trace(model, (input_ids, attention_mask), strict=False)
    traced_model.save(model_file)

    # warmup
    with torch.no_grad():
        traced_model(input_ids, attention_mask)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            traced_model(input_ids, attention_mask)
    execution_time = time.time() - start_time
    print("traced model: ", execution_time)

    # warmup
    with torch.no_grad():
        model(**inputs)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            model(**inputs)
    execution_time = time.time() - start_time
    print("huggingface model: ", execution_time)

    model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512, device='cpu')
    model.predict(pairs)

    start_time = time.time()
    for _ in range(10):
        model.predict(pairs)
    execution_time = time.time() - start_time
    print("CrossEncoder: ", execution_time)


if __name__ == '__main__':
    main()
RuntimeError: default_program(24): error: extra text after expected end of number
aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
^
default_program(28): error: extra text after expected end of number
aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
^
2 errors detected in the compilation of "default_program".
nvrtc compilation failed:
#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)
template<typename T>
__device__ T maximum(T a, T b) {
return isnan(a) ? a : (a > b ? a : b);
}
template<typename T>
__device__ T minimum(T a, T b) {
return isnan(a) ? a : (a < b ? a : b);
}
extern "C" __global__
void fused_mul_div_add(float* tattention_scores_1, float* tv_, float* aten_add, float* aten_mul) {
{
if (blockIdx.x<1ll ? 1 : 0) {
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<86ll ? 1 : 0) {
if (blockIdx.x<1ll ? 1 : 0) {
float v = __ldg(tv_ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
} } }if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<59168ll ? 1 : 0) {
float v_1 = __ldg(tattention_scores_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
float v_2 = __ldg(tv_ + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) % 43ll + 43ll * (((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 29584ll));
aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
}}
}
Environment
OS: Linux
nvidia-cuda-runtime-cu12 12.1.105
nvidia-cudnn-cu12 8.9.2.26
torch 2.3.0
This is a known bug in TorchScript in PyTorch 2.x. Please try PyTorch 1.13.1 on GPU.
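For reference, the malformed float literal in the generated kernel (-3.402823466385289e+38.f) comes from the NNC/TensorExpr GPU fuser. If downgrading is not an option, disabling that fuser before running the traced model sometimes works around the NVRTC failure; these are internal, undocumented switches, so treat this as a best-effort sketch rather than a supported API:

import torch

# turn off the TensorExpr (NNC) fuser that emits the bad literals
torch._C._jit_set_texpr_fuser_enabled(False)
# optionally also disallow GPU fusion in the legacy executor
torch._C._jit_override_can_fuse_on_gpu(False)

traced_model = torch.jit.load("bge-reranker-v2-m3.pt", map_location='cuda')
with torch.no_grad():
    traced_model(input_ids, attention_mask)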
Your best option for running text embedding is to convert the model to ONNX; see: https://github.com/deepjavalibrary/djl/blob/master/extensions/tokenizers/src/main/python/djl_converter/huggingface_converter.py#L63
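For illustration only, a roughly equivalent export outside of the linked djl_converter script could use Hugging Face Optimum (this assumes the optimum and onnxruntime packages are installed; the API shown is Optimum's, not DJL's):

from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_id = "BAAI/bge-reranker-v2-m3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# export=True converts the PyTorch checkpoint to ONNX on the fly
ort_model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
ort_model.save_pretrained("bge-reranker-v2-m3-onnx")

pairs = [['what is panda?', 'hi']]
inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
scores = ort_model(**inputs).logits.view(-1).float()
print(scores)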