BiSeNet TRT inference for two images does not work properly

Hi!

Trained BisenetV2 model and converted it .trt model successfully
When we infer on one image model shows correct result in infer
but when we send 2 images (modified the code to infer on two images)

Here in batch inference we get correct output for first image in batch for second image we don't get inference correctly

@CoinCheung Any help will be greatly appreciated.

In convert_onnx.py we tried to change With batch size 2:

dummy_input = torch.randn(2, 3, 320, 410)
input_names = ['input_image']
output_names = ['preds',]

torch.onnx.export(net, dummy_input, args.out_pth,
    input_names=input_names, output_names=output_names,
    verbose=True, opset_version=11)

With batch_size variable:

dummy_input = torch.randn(1, 3, 320, 410)
input_names = ['input_image']
output_names = ['preds',]

torch.onnx.export(net, dummy_input, args.out_pth,
    input_names=input_names, output_names=output_names,
    verbose=False, opset_version=11, dynamic_axes={'input_image' : {0 : 'batch_size'}, 'preds' : {0 : 'batch_size'}})

The output onnx models were visualized in netron and it seemed to have been converted fine

Changes made in tensorrt files:

segment.cpp:

TrtSharedEnginePtr parse_to_engine(string onnx_pth, bool use_fp16) {
    unsigned int maxBatchSize{2};
    int memory_limit = 1U << 30; // 1G

    auto builder = TrtUniquePtr<IBuilder>(nvinfer1::createInferBuilder(gLogger));
    if (!builder) {
        cout << "create builder failed\n";
        std::abort();
    }
    builder->setMaxBatchSize(maxBatchSize);
    const auto explicitBatch = 1U << static_cast<uint32_t>(
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = TrtUniquePtr<INetworkDefinition>(
            builder->createNetworkV2(explicitBatch));
    if (!network) {
        cout << "create network failed\n";
        std::abort();
    }

    auto config = TrtUniquePtr<IBuilderConfig>(builder->createBuilderConfig());
    if (!config) {
        cout << "create builder config failed\n";
        std::abort();
    }

    auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, gLogger));
    if (!parser) {
        cout << "create parser failed\n";
        std::abort();
    }
    
    int verbosity = (int)nvinfer1::ILogger::Severity::kWARNING;
    bool state = parser->parseFromFile(onnx_pth.c_str(), verbosity);
    if (!state) {
        cout << "parse model failed\n";
        std::abort();
    }

    // IOptimizationProfile* profile = builder->createOptimizationProfile();
    // profile->setDimensions("input_image", OptProfileSelector::kMIN, Dims4(1, 3,320,410));
    // profile->setDimensions("input_image", OptProfileSelector::kOPT, Dims4(2, 3,320,410));
    // profile->setDimensions("input_image", OptProfileSelector::kMAX, Dims4(4, 3,320,410));
    // profile->setDimensions("preds", OptProfileSelector::kMIN, Dims3(1,320,410));
    // profile->setDimensions("preds", OptProfileSelector::kOPT, Dims3(2,320,410));
    // profile->setDimensions("preds", OptProfileSelector::kMAX, Dims3(4,320,410));
    
    // config->addOptimizationProfile(profile);

    config->setMaxWorkspaceSize(memory_limit*2);
    if (use_fp16 && builder->platformHasFastFp16()) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16); // fp16
    }
    // TODO: see if use dla or int8

    auto output = network->getOutput(0);
    output->setType(nvinfer1::DataType::kINT32);

    TrtSharedEnginePtr engine = shared_engine_ptr(
            builder->buildEngineWithConfig(*network, *config));
    if (!engine) {
        cout << "create engine failed\n";
        std::abort();
    }

    return engine;
}

The optimization profile was added for dynamic batch size

vector<int> infer_with_engine(TrtSharedEnginePtr engine, vector<float>& data) {
    Dims3 out_dims = static_cast<Dims3&&>(
        engine->getBindingDimensions(engine->getBindingIndex("preds")));
    Dims3 in_dims = static_cast<Dims3&&>(
        engine->getBindingDimensions(engine->getBindingIndex("input_image")));

    const int batchsize{2}, H{out_dims.d[1]}, W{out_dims.d[2]}, iH{in_dims.d[2]}, iW{in_dims.d[3]};
    const int in_size{static_cast<int>(data.size())};
    const int out_size{batchsize * H * W};
    printf("Input size: %d and output size: %d \n", in_size, out_size );
    vector<void*> buffs(2);
    vector<int> res(out_size);

    auto context = TrtUniquePtr<IExecutionContext>(engine->createExecutionContext());
    if (!context) {
        cout << "create execution context failed\n";
        std::abort();
    }

    cudaError_t state;
    state = cudaMalloc(&buffs[0], in_size * sizeof(float));
    if (state) {
        cout << "allocate memory failed\n";
        std::abort();
    }
    state = cudaMalloc(&buffs[1], out_size * sizeof(int));
    if (state) {
        cout << "allocate memory failed\n";
        std::abort();
    }
    cudaStream_t stream;
    state = cudaStreamCreate(&stream);
    // context->setOptimizationProfileAsync(0, stream);
    context->setBindingDimensions(engine->getBindingIndex("input_image"), nvinfer1::Dims4(batchsize, 3, iH, iW));
    if (state) {
        cout << "create stream failed\n";
        std::abort();
    }

    state = cudaMemcpyAsync(
            buffs[0], &data[0], in_size * sizeof(float),
            cudaMemcpyHostToDevice, stream);
    if (state) {
        cout << "transmit to device failed\n";
        std::abort();
    }
    // context->enqueueV2(&buffs[0], stream, nullptr);

    context->enqueue(batchsize, &buffs[0], stream, nullptr);
    // context->enqueue(batchsize, &buffs[0], stream, nullptr);
    state = cudaMemcpyAsync(
            &res[0], buffs[1], out_size * sizeof(int), 
            cudaMemcpyDeviceToHost, stream);
    if (state) {
        cout << "transmit to host failed \n";
        std::abort();
    }
    cudaStreamSynchronize(stream);

    cudaFree(buffs[0]);
    cudaFree(buffs[1]);
    cudaStreamDestroy(stream);

    return res;
}

Aug 24 '22 12:08 syedrz

If you fix batchsize=2, maybe you do not need to use dynamic batch.

Aug 25 '22 01:08 CoinCheung

I tried with batchsize=2 The issue still persists

Would you please try TRT batch inference using two images

Thanks

Aug 25 '22 10:08 syedrz