onnxruntime_backend
How to create onnx model for ragged batching?
I prepared a simple example.
I created a simple summing model that takes an input tensor and a lengths tensor, both with shape = [-1]:
import torch
import torch.nn as nn


class SummingModel(nn.Module):
    def forward(self, input, lengths):
        batch_size = lengths.shape[0]
        sums = torch.zeros(batch_size, 1)
        start = 0
        for i in range(batch_size):
            end = start + lengths[i]
            sums[i][0] = input[start:end].sum()
            start = end
        return sums
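In eager PyTorch the model does what I expect. A quick sanity check with hand-picked values (this instantiates its own model just for the check):

# eager-mode sanity check, values chosen by hand
check_model = SummingModel()
x = torch.tensor([1., 1., 2., 2., 2., 2., 3.])
check_lengths = torch.tensor([2, 4, 1])
print(check_model(x, check_lengths))  # tensor([[2.], [8.], [3.]])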
model = SummingModel()
dummy_input = torch.randn(10), torch.tensor([2, 7, 10])
dynamic_axes = {
    'input': {0: 'length'},       # dynamic batch size for `input`
    'lengths': {0: 'length'},     # dynamic batch size for `lengths`
    'output': {0: 'batch_size'},  # dynamic batch size for output
}
torch.onnx.export(model, dummy_input, "1/model.onnx",
                  input_names=['input', 'lengths'],
                  output_names=['output'],
                  dynamic_axes=dynamic_axes)
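To see which dtypes the exported graph actually declares, I inspect it with onnxruntime (a small sketch; the printed types below are what I would expect, not captured output):

# sketch: print the declared input dtypes of the exported model
import onnxruntime as ort

sess = ort.InferenceSession("1/model.onnx", providers=["CPUExecutionProvider"])
for inp in sess.get_inputs():
    print(inp.name, inp.type, inp.shape)
# expected roughly:
#   input    tensor(float)  ['length']
#   lengths  tensor(int64)  ['length']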
And Triton loaded this model, with a config like:
name: "SummingModel"
max_batch_size: 16
platform: "onnxruntime_onnx"
input [
{
name: "input"
data_type: TYPE_FP32
dims: [ -1 ]
allow_ragged_batch: true
}
]
batch_input [
{
kind: BATCH_ACCUMULATED_ELEMENT_COUNT
target_name: "lengths"
data_type: TYPE_FP32
source_input: "input"
}
]
output [
{
name: "output"
data_type: TYPE_FP32
dims: [ -1 ]
}
]
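My reading of the ragged batching docs (so treat this as my assumption) is that BATCH_ACCUMULATED_ELEMENT_COUNT makes Triton concatenate the ragged inputs and pass cumulative element counts, roughly like this:

# sketch of what I assume Triton builds for four requests of lengths 2, 4, 1, 3
import numpy as np

per_request_lengths = [2, 4, 1, 3]
concatenated_elements = sum(per_request_lengths)   # the batched "input" has 10 elements
accumulated = np.cumsum(per_request_lengths)       # the "lengths" batch input
print(concatenated_elements, accumulated)          # 10 [ 2  6  7 10]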
But when I try to run this model with tritonhttpclient, it throws an error.
import tritonhttpclient
import numpy as np

model_name = "SummingModel"

inputs = []
for value in [2, 4, 1, 3]:
    inputs.append(
        [tritonhttpclient.InferInput('input', [1, value], "FP32")])
    inputs[-1][0].set_data_from_numpy(
        np.full([1, value], value, np.float32))

output_name = 'output'
outputs = [tritonhttpclient.InferRequestedOutput(output_name)]

async_requests = []
client = tritonhttpclient.InferenceServerClient(url="localhost:8000",
                                                concurrency=len(inputs))
for request_inputs in inputs:  # renamed so the loop does not shadow the list itself
    # Asynchronous inference call.
    async_requests.append(
        client.async_infer(model_name=model_name,
                           inputs=request_inputs,
                           outputs=outputs))

for idx in range(len(async_requests)):
    # Get the result from the initiated asynchronous inference request.
    # Note the call will block till the server responds.
    result = async_requests[idx].get_result()
    # Validate the results by comparing with precomputed values.
    output_data = result.as_numpy(output_name)
    print(output_data)
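For reference, the values I expect back: each request of length n is filled with the value n, so each sum should be n*n:

# expected sums for the four requests above
for n in [2, 4, 1, 3]:
    print(n, "->", float(np.full([1, n], n, np.float32).sum()))  # 4.0, 16.0, 1.0, 9.0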
Instead, the server returns:
InferenceServerException: [400] onnx runtime error 2: Unexpected input data type. Actual: (tensor(float)) , expected: (tensor(int64))
It is also interesting why the datatype for BATCH_ACCUMULATED_ELEMENT_COUNT in the documentation example is float32 and not int.
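One idea I had but have not tried yet: declare lengths as float in the ONNX graph and cast it inside forward(), so the input dtype matches what the batch_input can deliver. This is only a sketch of that assumption, not a confirmed fix:

# untested sketch: accept fp32 "lengths" from the batch_input and cast inside the graph
class SummingModelFp32Lengths(nn.Module):
    def forward(self, input, lengths):
        lengths = lengths.to(torch.int64)  # cast whatever Triton's batch_input delivers
        batch_size = lengths.shape[0]
        sums = torch.zeros(batch_size, 1)
        start = 0
        for i in range(batch_size):
            end = start + lengths[i]
            sums[i][0] = input[start:end].sum()
            start = end
        return sums

# exporting with a float dummy for lengths should make that graph input tensor(float), e.g.
# torch.onnx.export(SummingModelFp32Lengths(), (torch.randn(10), torch.tensor([2., 7., 10.])), ...)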
OK, after that I changed the data type for lengths in my config to TYPE_INT64, but now Triton will not load the model at all; it only loads it when I use TYPE_INT32, even though torch.tensor([2, 7, 10]) has dtype int64.
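To double-check what the server actually ends up with when the model does load, I query the config through the client object from the script above (sketch; I have not pasted the output here):

# sketch: ask Triton for the config it actually loaded
loaded_config = client.get_model_config(model_name)
print(loaded_config)  # look at the batch_input entry's data_type in the returned dict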
What am I doing wrong? Could you help me, please?