
No better performance for concurrent requests using batching

Open · katerinafrid opened this issue on Oct 25, 2022 · 2 comments

Discussed in https://github.com/bentoml/BentoML/discussions/3137

Originally posted by katerinafrid on October 24, 2022:

I'm trying to speed up the processing of requests to my PyTorch model, but I do not see any improvement compared to standard sequential processing. Am I doing something wrong? First, I execute the utils.py file, then I start a server by running server.py.

utils.py

import bentoml
import torch
import transformers
from transformers.pipelines import SUPPORTED_TASKS


class NLUPipeline(transformers.Pipeline):
    def preprocess(self, inputs):
        return self.tokenizer(inputs['premise_list'], inputs['hypothesis_list'], truncation='only_first',
                              max_length=128, padding=True, return_tensors='pt').to(self.device)

    def _sanitize_parameters(self, **kwargs):
        return {}, {}, {}

    def _forward(self, model_inputs):
        model_out = self.model(**model_inputs)
        return model_out.logits

    @classmethod
    def postprocess(cls, model_outputs):
        return model_outputs


class NLURunnable(bentoml.Runnable):
    # "nvidia.com/gpu" and "cpu" are the resource names BentoML recognizes
    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self):
        # load the stored pipeline, placing it on GPU when one is available
        self.nlu_model = bentoml.transformers.load_model(
            "nlu_pl:latest", device=0 if torch.cuda.is_available() else -1
        )

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def predict(self, input_data):
        return self.nlu_model(input_data)


def register_nlu_pipeline():
    TASK_NAME = "zero-shot-classification"
    TASK_DEFINITION = {
        "impl": NLUPipeline,
        "tf": (),
        "pt": (transformers.AutoModelForSequenceClassification,),
        "default": {},
        "type": "text",
    }
    SUPPORTED_TASKS[TASK_NAME] = TASK_DEFINITION

def create_nlu_pipeline(nlu_model_path: str = default_nlu_model_path):
    classifier = transformers.pipeline(
        task="zero-shot-classification",
        model=transformers.AutoModelForSequenceClassification.from_pretrained(
            nlu_model_path
        ),
        tokenizer=transformers.AutoTokenizer.from_pretrained(
            nlu_model_path
        ),
    )
    return classifier

nlu_pl = create_nlu_pipeline('path')
bentoml.transformers.save_model(
    'nlu_pl',
    pipeline=nlu_pl,
    signatures={
        "predict": {
            "batchable": True,
            "batch_dim": 0,
        },
    },
)
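A quick way to confirm that the batchable signature was actually recorded after running utils.py is to read the model back from the local store; a minimal sketch, assuming the pipeline was saved as nlu_pl above:

import bentoml

# Fetch the stored model's metadata and print its signatures;
# "predict" should show batchable=True and batch_dim=0.
bento_model = bentoml.transformers.get("nlu_pl:latest")
print(bento_model.info.signatures)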

server.py

from typing import Any, Dict, List

import bentoml
from bentoml.io import JSON
from pydantic import BaseModel

from utils import NLURunnable, register_nlu_pipeline  # defined in utils.py above

nlu_model = bentoml.transformers.get("nlu_pl:latest")
nlu_runner = bentoml.Runner(
    NLURunnable,
    models=[nlu_model],
    method_configs={"predict": {"max_batch_size": 16, "max_latency_ms": 600}},
)
register_nlu_pipeline()
svc = bentoml.Service("server", runners=[nlu_runner])


class NLURequest(BaseModel):
    premise_list: List[str]
    hypothesis_list: List[str]

@svc.api(input=JSON(pydantic_model=NLURequest), output=JSON())
async def nlu_request(json: NLURequest) -> Dict[str, Any]:
    req_body = {"premise_list": json.premise_list, "hypothesis_list": json.hypothesis_list}
    response = await nlu_runner.predict.async_run(req_body)
    return {"result": response.cpu().numpy()}

katerinafrid commented on Oct 25, 2022
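Adaptive batching can only merge requests that are in flight at the same time, so a purely sequential benchmark will never produce batches larger than one. A minimal concurrent load-test sketch, assuming the service above is running on localhost:3000 and the route name defaults to the function name (/nlu_request):

import asyncio

import httpx

payload = {
    "premise_list": ["The cat sat on the mat."],
    "hypothesis_list": ["An animal is resting."],
}

async def main(n_requests: int = 32) -> None:
    # Fire all requests at once so the runner's dispatcher has
    # concurrent work to merge into batches.
    async with httpx.AsyncClient(base_url="http://localhost:3000", timeout=30) as client:
        tasks = [client.post("/nlu_request", json=payload) for _ in range(n_requests)]
        responses = await asyncio.gather(*tasks)
    print([r.status_code for r in responses])

asyncio.run(main())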

I am experiencing an error with batching enabled:

2022-11-08T13:07:08+0000 [INFO] [api_server:1] 127.0.0.1:33304 (scheme=http,method=POST,path=/v1/get_intents,type=application/json,length=91) (status=200,type=application/json,length=20) 1450.984ms (trace=f8472c38b374f57a7213989491a40acc,span=c32005903caa5b5f,sampled=0)
Traceback (most recent call last):
  File "/workspace/personality_framework/personality_service/bento_service.py", line 238, in get_intent
    result=await runner1.is_positive.async_run([{"sentence":query}])
  File "/tmp/e2/lib/python3.8/site-packages/bentoml/_internal/runner/runner.py", line 53, in async_run
    return await self.runner._runner_handle.async_run_method(  # type: ignore
  File "/tmp/e2/lib/python3.8/site-packages/bentoml/_internal/runner/runner_handle/remote.py", line 207, in async_run_method
    raise ServiceUnavailable(body.decode()) from None

Without batching, the same code works well.

My batching configuration is:

enabled: true
max_batch_size: 100
max_latency_ms: 1000

Without batching, under load testing I get replies to my 100 simultaneous requests without error; with batching I'm facing the above error.

pi2cto commented on Nov 8, 2022
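One detail worth noting in the log above: the failing request took about 1451 ms while max_latency_ms is set to 1000. With batching enabled, BentoML's dispatcher rejects requests it does not expect to serve within that latency budget, which surfaces as the ServiceUnavailable in the traceback, so a budget below the model's real batch latency can produce exactly this error under load. A sketch of loosening the budget through the same runner method config used in server.py (the numbers are illustrative only):

nlu_runner = bentoml.Runner(
    NLURunnable,
    models=[nlu_model],
    method_configs={"predict": {"max_batch_size": 100, "max_latency_ms": 10000}},
)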

Can you try the latest BentoML version, with the new service API?

frostming commented on Jul 11, 2024
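A minimal sketch of the newer service API (BentoML 1.2+) mentioned above, reusing the stored nlu_pl model; the class name is illustrative and the batching parameters simply mirror the ones from the Runner-based code earlier in the thread:

import bentoml
import torch

from utils import register_nlu_pipeline  # the custom zero-shot task still needs registering

register_nlu_pipeline()


@bentoml.service
class NLUService:
    def __init__(self) -> None:
        # Load the stored pipeline, placing it on GPU when one is available.
        self.nlu_model = bentoml.transformers.load_model(
            "nlu_pl:latest", device=0 if torch.cuda.is_available() else -1
        )

    @bentoml.api(batchable=True, batch_dim=0, max_batch_size=16, max_latency_ms=600)
    def predict(self, input_data: list) -> list:
        return self.nlu_model(input_data)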