BiSeNet
BiSeNet copied to clipboard
TRT inference for two images does not work properly
Hi!
- Trained BisenetV2 model and converted it .trt model successfully
- When we infer on one image model shows correct result in infer
- but when we send 2 images (modified the code to infer on two images)
Here in batch inference we get correct output for first image in batch for second image we don't get inference correctly
@CoinCheung Any help will be greatly appreciated.
In convert_onnx.py we tried to change With batch size 2:
dummy_input = torch.randn(2, 3, 320, 410)
input_names = ['input_image']
output_names = ['preds',]
torch.onnx.export(net, dummy_input, args.out_pth,
input_names=input_names, output_names=output_names,
verbose=True, opset_version=11)
With batch_size variable:
dummy_input = torch.randn(1, 3, 320, 410)
input_names = ['input_image']
output_names = ['preds',]
torch.onnx.export(net, dummy_input, args.out_pth,
input_names=input_names, output_names=output_names,
verbose=False, opset_version=11, dynamic_axes={'input_image' : {0 : 'batch_size'}, 'preds' : {0 : 'batch_size'}})
The output onnx models were visualized in netron and it seemed to have been converted fine
Changes made in tensorrt files:
- segment.cpp:
TrtSharedEnginePtr parse_to_engine(string onnx_pth, bool use_fp16) {
unsigned int maxBatchSize{2};
int memory_limit = 1U << 30; // 1G
auto builder = TrtUniquePtr<IBuilder>(nvinfer1::createInferBuilder(gLogger));
if (!builder) {
cout << "create builder failed\n";
std::abort();
}
builder->setMaxBatchSize(maxBatchSize);
const auto explicitBatch = 1U << static_cast<uint32_t>(
nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
auto network = TrtUniquePtr<INetworkDefinition>(
builder->createNetworkV2(explicitBatch));
if (!network) {
cout << "create network failed\n";
std::abort();
}
auto config = TrtUniquePtr<IBuilderConfig>(builder->createBuilderConfig());
if (!config) {
cout << "create builder config failed\n";
std::abort();
}
auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, gLogger));
if (!parser) {
cout << "create parser failed\n";
std::abort();
}
int verbosity = (int)nvinfer1::ILogger::Severity::kWARNING;
bool state = parser->parseFromFile(onnx_pth.c_str(), verbosity);
if (!state) {
cout << "parse model failed\n";
std::abort();
}
// IOptimizationProfile* profile = builder->createOptimizationProfile();
// profile->setDimensions("input_image", OptProfileSelector::kMIN, Dims4(1, 3,320,410));
// profile->setDimensions("input_image", OptProfileSelector::kOPT, Dims4(2, 3,320,410));
// profile->setDimensions("input_image", OptProfileSelector::kMAX, Dims4(4, 3,320,410));
// profile->setDimensions("preds", OptProfileSelector::kMIN, Dims3(1,320,410));
// profile->setDimensions("preds", OptProfileSelector::kOPT, Dims3(2,320,410));
// profile->setDimensions("preds", OptProfileSelector::kMAX, Dims3(4,320,410));
// config->addOptimizationProfile(profile);
config->setMaxWorkspaceSize(memory_limit*2);
if (use_fp16 && builder->platformHasFastFp16()) {
config->setFlag(nvinfer1::BuilderFlag::kFP16); // fp16
}
// TODO: see if use dla or int8
auto output = network->getOutput(0);
output->setType(nvinfer1::DataType::kINT32);
TrtSharedEnginePtr engine = shared_engine_ptr(
builder->buildEngineWithConfig(*network, *config));
if (!engine) {
cout << "create engine failed\n";
std::abort();
}
return engine;
}
The optimization profile was added for dynamic batch size
vector<int> infer_with_engine(TrtSharedEnginePtr engine, vector<float>& data) {
Dims3 out_dims = static_cast<Dims3&&>(
engine->getBindingDimensions(engine->getBindingIndex("preds")));
Dims3 in_dims = static_cast<Dims3&&>(
engine->getBindingDimensions(engine->getBindingIndex("input_image")));
const int batchsize{2}, H{out_dims.d[1]}, W{out_dims.d[2]}, iH{in_dims.d[2]}, iW{in_dims.d[3]};
const int in_size{static_cast<int>(data.size())};
const int out_size{batchsize * H * W};
printf("Input size: %d and output size: %d \n", in_size, out_size );
vector<void*> buffs(2);
vector<int> res(out_size);
auto context = TrtUniquePtr<IExecutionContext>(engine->createExecutionContext());
if (!context) {
cout << "create execution context failed\n";
std::abort();
}
cudaError_t state;
state = cudaMalloc(&buffs[0], in_size * sizeof(float));
if (state) {
cout << "allocate memory failed\n";
std::abort();
}
state = cudaMalloc(&buffs[1], out_size * sizeof(int));
if (state) {
cout << "allocate memory failed\n";
std::abort();
}
cudaStream_t stream;
state = cudaStreamCreate(&stream);
// context->setOptimizationProfileAsync(0, stream);
context->setBindingDimensions(engine->getBindingIndex("input_image"), nvinfer1::Dims4(batchsize, 3, iH, iW));
if (state) {
cout << "create stream failed\n";
std::abort();
}
state = cudaMemcpyAsync(
buffs[0], &data[0], in_size * sizeof(float),
cudaMemcpyHostToDevice, stream);
if (state) {
cout << "transmit to device failed\n";
std::abort();
}
// context->enqueueV2(&buffs[0], stream, nullptr);
context->enqueue(batchsize, &buffs[0], stream, nullptr);
// context->enqueue(batchsize, &buffs[0], stream, nullptr);
state = cudaMemcpyAsync(
&res[0], buffs[1], out_size * sizeof(int),
cudaMemcpyDeviceToHost, stream);
if (state) {
cout << "transmit to host failed \n";
std::abort();
}
cudaStreamSynchronize(stream);
cudaFree(buffs[0]);
cudaFree(buffs[1]);
cudaStreamDestroy(stream);
return res;
}
If you fix batchsize=2, maybe you do not need to use dynamic batch.
I tried with batchsize=2 The issue still persists
Would you please try TRT batch inference using two images
Thanks