
TorchScript inference slower than default torch model

Open zaobao opened this issue 9 months ago • 2 comments

I found that inference with the TorchScript model (converted by model_zoo_importer.py) is slower than with the default PyTorch format.

from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import time

pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512)

model.predict(pairs)

print("model prewarmed")

start_time = time.time()

for _ in range(10):
    model.predict(pairs)

end_time = time.time()

execution_time = end_time - start_time

print("execution_time1: ", execution_time)

tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
model1 = torch.jit.load("bge-reranker-v2-m3.pt")  # a TorchScript archive should be loaded with torch.jit.load
model1.eval()

# actually run one forward pass so the first-call JIT optimization is not timed
with torch.no_grad():
    model1(**inputs)

print("model1 prewarmed")

start_time = time.time()

for _ in range(10):
    with torch.no_grad():
        scores = model1(**inputs)['logits'].view(-1).float()

end_time = time.time()

execution_time = end_time - start_time

print("execution_time2: ", execution_time)

execution_time1:  0.4500548839569092
execution_time2:  2.066737174987793

zaobao avatar May 11 '24 09:05 zaobao

On both 'cpu' and 'cuda', the .pt model takes 4-5 times as long as the source model.

zaobao avatar May 11 '24 09:05 zaobao

I'm not able to reproduce your case. I used the following code:

import os.path
import time

import torch
from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def main():
    pairs = [['what is panda?', 'hi'],
             ['what is panda?',
              'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)

    model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
    model.eval()

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    model_file = "bge-reranker-v2-m3.pt"
    if os.path.exists(model_file):
        traced_model = torch.jit.load(model_file)
        traced_model.eval()
    else:
        traced_model = torch.jit.trace(model, (input_ids, attention_mask),
                                       strict=False)
        traced_model.save(model_file)

    # warmup
    with torch.no_grad():
        traced_model(input_ids, attention_mask)

    start_time = time.time()

    with torch.no_grad():
        for _ in range(10):
            traced_model(input_ids, attention_mask)

    execution_time = time.time() - start_time
    print("traced model: ", execution_time)

    # warmup
    with torch.no_grad():
        model(**inputs)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            model(**inputs)

    execution_time = time.time() - start_time
    print("huggingface model: ", execution_time)

    model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512)
    model.predict(pairs)

    start_time = time.time()
    for _ in range(10):
        model.predict(pairs)

    execution_time = time.time() - start_time
    print("CrossEncoder: ", execution_time)


if __name__ == '__main__':
    main()

traced model:  2.593564033508301
huggingface model:  2.4144911766052246
CrossEncoder:  2.5312209129333496

frankfliu avatar May 12 '24 19:05 frankfliu

@frankfliu I modified your script to run on CUDA, but got an error message:

import os.path
import time

import torch
from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def main():
    pairs = [['what is panda?', 'hi'],
             ['what is panda?',
              'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)

    model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3').to('cuda')
    model.eval()

    input_ids = inputs["input_ids"].to('cuda')
    attention_mask = inputs["attention_mask"].to('cuda')
    inputs = inputs.to('cuda')

    model_file = "bge-reranker-v2-m3.pt"

    traced_model = torch.jit.trace(model, (input_ids, attention_mask),
                                   strict=False)
    traced_model.save(model_file)

    # warmup
    with torch.no_grad():
        traced_model(input_ids, attention_mask)

    start_time = time.time()

    with torch.no_grad():
        for _ in range(10):
            traced_model(input_ids, attention_mask)

    execution_time = time.time() - start_time
    print("traced model: ", execution_time)

    # warmup
    with torch.no_grad():
        model(**inputs)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            model(**inputs)

    execution_time = time.time() - start_time
    print("huggingface model: ", execution_time)

    model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512, device='cpu')
    model.predict(pairs)

    start_time = time.time()
    for _ in range(10):
        model.predict(pairs)

    execution_time = time.time() - start_time
    print("CrossEncoder: ", execution_time)


if __name__ == '__main__':
    main()

RuntimeError: default_program(24): error: extra text after expected end of number
          aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
                                                                                                           ^

default_program(28): error: extra text after expected end of number
      aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
                                                                                                                     ^

2 errors detected in the compilation of "default_program".

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)


template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern "C" __global__
void fused_mul_div_add(float* tattention_scores_1, float* tv_, float* aten_add, float* aten_mul) {
{
if (blockIdx.x<1ll ? 1 : 0) {
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<86ll ? 1 : 0) {
if (blockIdx.x<1ll ? 1 : 0) {
        float v = __ldg(tv_ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
        aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
      }    }  }if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<59168ll ? 1 : 0) {
    float v_1 = __ldg(tattention_scores_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
    float v_2 = __ldg(tv_ + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) % 43ll + 43ll * (((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 29584ll));
    aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
  }}
}

Environment

OS: Linux

nvidia-cuda-runtime-cu12 12.1.105
nvidia-cudnn-cu12 8.9.2.26
torch 2.3.0

zaobao avatar May 13 '24 01:05 zaobao

This is a known bug in TorchScript in PyTorch 2.x: the fuser emits float literals such as -3.402823466385289e+38.f, and nvrtc rejects the stray '.' before the 'f' suffix ("extra text after expected end of number"). Please try PyTorch 1.13.1 on GPU.
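
If you have to stay on PyTorch 2.x, a workaround that is often suggested for this nvrtc failure is to disable the TensorExpr (NNC) fuser that generates the broken kernel. A minimal sketch, assuming the internal torch._C._jit_set_texpr_fuser_enabled flag (an internal API, so treat this as an assumption rather than a supported option):

import torch

# Internal flag: stop the TensorExpr (NNC) fuser from generating fused CUDA
# kernels, so the malformed literal that nvrtc rejects is never emitted.
# This gives up fused-kernel speedups but lets the traced model run on 2.x.
torch._C._jit_set_texpr_fuser_enabled(False)

traced_model = torch.jit.load("bge-reranker-v2-m3.pt", map_location="cuda")
traced_model.eval()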

Still, your best option for running text-embedding models is to convert them to ONNX; see: https://github.com/deepjavalibrary/djl/blob/master/extensions/tokenizers/src/main/python/djl_converter/huggingface_converter.py#L63
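
For illustration, here is a minimal sketch of the ONNX route using Hugging Face Optimum rather than the DJL converter linked above (it assumes optimum is installed with its onnxruntime extra; the converter script remains the DJL-supported path):

import torch
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
# export=True converts the PyTorch checkpoint to ONNX on the fly
model = ORTModelForSequenceClassification.from_pretrained(
    'BAAI/bge-reranker-v2-m3', export=True)

pairs = [['what is panda?', 'hi']]
inputs = tokenizer(pairs, padding=True, truncation=True,
                   return_tensors='pt', max_length=512)
with torch.no_grad():
    scores = model(**inputs).logits.view(-1).float()
print(scores)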

frankfliu avatar May 13 '24 01:05 frankfliu