
TRT inference for two images does not work properly

syedrz opened this issue on Aug 24, 2022 · 2 comments

Hi!

  • Trained a BiSeNetV2 model and converted it to a .trt engine successfully.
  • Inference on a single image produces the correct result.
  • But when we send two images (we modified the code to infer on a batch of two), batch inference returns the correct output for the first image in the batch, while the output for the second image is wrong. A sketch of the per-image output layout we assume is below.
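For reference, we read the batch output back assuming the predictions are laid out contiguously per image: image 0's H×W map first, then image 1's. A sketch of a helper that splits the flattened result per image, in case the offset arithmetic is where things go wrong (the helper name is illustrative, not from the repo):

// Hypothetical helper: split the flattened batch output into per-image maps.
// Assumes res.size() == batchsize * H * W, laid out image-by-image.
vector<vector<int>> split_batch(const vector<int>& res, int batchsize, int H, int W) {
    vector<vector<int>> per_image;
    for (int b = 0; b < batchsize; ++b) {
        per_image.emplace_back(res.begin() + b * H * W,
                               res.begin() + (b + 1) * H * W);
    }
    return per_image;
}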

@CoinCheung Any help will be greatly appreciated.

In convert_onnx.py we tried two variants. With a fixed batch size of 2:

dummy_input = torch.randn(2, 3, 320, 410)
input_names = ['input_image']
output_names = ['preds',]

torch.onnx.export(net, dummy_input, args.out_pth,
    input_names=input_names, output_names=output_names,
    verbose=True, opset_version=11)

And with a dynamic batch dimension:

dummy_input = torch.randn(1, 3, 320, 410)
input_names = ['input_image']
output_names = ['preds',]

torch.onnx.export(net, dummy_input, args.out_pth,
    input_names=input_names, output_names=output_names,
    verbose=False, opset_version=11,
    dynamic_axes={'input_image': {0: 'batch_size'},
                  'preds': {0: 'batch_size'}})

The exported ONNX models were visualized in Netron and appeared to have been converted correctly; a quick engine-side check is sketched below.
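For the fixed-batch export, a quick sanity check on the TensorRT side is to print the engine's input binding dimensions after deserialization. A sketch, assuming an already-built engine and the binding names used above:

// Sketch: confirm the batch size baked into the engine. For an explicit-batch
// engine, d[0] of the input binding is the batch dimension (it reads -1 if
// the export used a dynamic batch axis).
int in_idx = engine->getBindingIndex("input_image");
nvinfer1::Dims in_dims = engine->getBindingDimensions(in_idx);
printf("input binding dims: %d x %d x %d x %d\n",
        in_dims.d[0], in_dims.d[1], in_dims.d[2], in_dims.d[3]);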

Changes made in the TensorRT files:

  1. segment.cpp:
TrtSharedEnginePtr parse_to_engine(string onnx_pth, bool use_fp16) {
    unsigned int maxBatchSize{2};
    size_t memory_limit = 1UL << 30; // 1 GiB (size_t so memory_limit*2 below cannot overflow)

    auto builder = TrtUniquePtr<IBuilder>(nvinfer1::createInferBuilder(gLogger));
    if (!builder) {
        cout << "create builder failed\n";
        std::abort();
    }
    builder->setMaxBatchSize(maxBatchSize); // note: ignored for explicit-batch networks
    const auto explicitBatch = 1U << static_cast<uint32_t>(
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = TrtUniquePtr<INetworkDefinition>(
            builder->createNetworkV2(explicitBatch));
    if (!network) {
        cout << "create network failed\n";
        std::abort();
    }

    auto config = TrtUniquePtr<IBuilderConfig>(builder->createBuilderConfig());
    if (!config) {
        cout << "create builder config failed\n";
        std::abort();
    }

    auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, gLogger));
    if (!parser) {
        cout << "create parser failed\n";
        std::abort();
    }
    
    int verbosity = (int)nvinfer1::ILogger::Severity::kWARNING;
    bool state = parser->parseFromFile(onnx_pth.c_str(), verbosity);
    if (!state) {
        cout << "parse model failed\n";
        std::abort();
    }

    // IOptimizationProfile* profile = builder->createOptimizationProfile();
    // profile->setDimensions("input_image", OptProfileSelector::kMIN, Dims4(1, 3,320,410));
    // profile->setDimensions("input_image", OptProfileSelector::kOPT, Dims4(2, 3,320,410));
    // profile->setDimensions("input_image", OptProfileSelector::kMAX, Dims4(4, 3,320,410));
    // profile->setDimensions("preds", OptProfileSelector::kMIN, Dims3(1,320,410));
    // profile->setDimensions("preds", OptProfileSelector::kOPT, Dims3(2,320,410));
    // profile->setDimensions("preds", OptProfileSelector::kMAX, Dims3(4,320,410));
    
    // config->addOptimizationProfile(profile);

    config->setMaxWorkspaceSize(memory_limit * 2); // 2 GiB workspace
    if (use_fp16 && builder->platformHasFastFp16()) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16); // fp16
    }
    // TODO: see whether to use DLA or int8

    auto output = network->getOutput(0);
    output->setType(nvinfer1::DataType::kINT32);

    TrtSharedEnginePtr engine = shared_engine_ptr(
            builder->buildEngineWithConfig(*network, *config));
    if (!engine) {
        cout << "create engine failed\n";
        std::abort();
    }

    return engine;
}

We also tried adding an optimization profile for the dynamic-batch case (the block commented out above); a corrected sketch of that block follows.
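If the dynamic-batch ONNX model is used, the engine cannot be built without an optimization profile, and a profile describes input tensors only, so the two 'preds' lines in the commented block would be rejected by addOptimizationProfile. A corrected sketch of that block (same min/opt/max shapes as the comments; untested, so treat it as an assumption):

// Sketch: optimization profile for the dynamic-batch model. Only the *input*
// gets min/opt/max shapes; the 'preds' output shape is inferred from them.
IOptimizationProfile* profile = builder->createOptimizationProfile();
profile->setDimensions("input_image", OptProfileSelector::kMIN, Dims4(1, 3, 320, 410));
profile->setDimensions("input_image", OptProfileSelector::kOPT, Dims4(2, 3, 320, 410));
profile->setDimensions("input_image", OptProfileSelector::kMAX, Dims4(4, 3, 320, 410));
config->addOptimizationProfile(profile); // must happen before buildEngineWithConfig

The modified inference function in segment.cpp: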

vector<int> infer_with_engine(TrtSharedEnginePtr engine, vector<float>& data) {
    // NOTE: these are the *engine* binding dims; with a dynamic batch axis
    // the batch entry d[0] is -1 here, so batchsize is hardcoded below.
    Dims3 out_dims = static_cast<Dims3&&>(
        engine->getBindingDimensions(engine->getBindingIndex("preds")));
    Dims3 in_dims = static_cast<Dims3&&>(
        engine->getBindingDimensions(engine->getBindingIndex("input_image")));

    const int batchsize{2}, H{out_dims.d[1]}, W{out_dims.d[2]}, iH{in_dims.d[2]}, iW{in_dims.d[3]};
    const int in_size{static_cast<int>(data.size())};
    const int out_size{batchsize * H * W};
    printf("Input size: %d and output size: %d \n", in_size, out_size );
    vector<void*> buffs(2);
    vector<int> res(out_size);

    auto context = TrtUniquePtr<IExecutionContext>(engine->createExecutionContext());
    if (!context) {
        cout << "create execution context failed\n";
        std::abort();
    }

    cudaError_t state;
    state = cudaMalloc(&buffs[0], in_size * sizeof(float));
    if (state) {
        cout << "allocate memory failed\n";
        std::abort();
    }
    state = cudaMalloc(&buffs[1], out_size * sizeof(int));
    if (state) {
        cout << "allocate memory failed\n";
        std::abort();
    }
    cudaStream_t stream;
    state = cudaStreamCreate(&stream);
    if (state) { // check stream creation before using the stream
        cout << "create stream failed\n";
        std::abort();
    }
    // context->setOptimizationProfileAsync(0, stream);
    context->setBindingDimensions(engine->getBindingIndex("input_image"),
            nvinfer1::Dims4(batchsize, 3, iH, iW));

    state = cudaMemcpyAsync(
            buffs[0], &data[0], in_size * sizeof(float),
            cudaMemcpyHostToDevice, stream);
    if (state) {
        cout << "transmit to device failed\n";
        std::abort();
    }
    // context->enqueueV2(&buffs[0], stream, nullptr);
    // NOTE: enqueue() is the implicit-batch launch; see the enqueueV2 sketch
    // after this function.
    context->enqueue(batchsize, &buffs[0], stream, nullptr);
    state = cudaMemcpyAsync(
            &res[0], buffs[1], out_size * sizeof(int), 
            cudaMemcpyDeviceToHost, stream);
    if (state) {
        cout << "transmit to host failed \n";
        std::abort();
    }
    cudaStreamSynchronize(stream);

    cudaFree(buffs[0]);
    cudaFree(buffs[1]);
    cudaStreamDestroy(stream);

    return res;
}
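Since the engine is built from an explicit-batch network, enqueue(batchsize, ...) is the deprecated implicit-batch API for it; the batch count is carried by the binding dimensions instead, which may be why only the first image comes out right. A sketch of the launch with enqueueV2 (same bindings and variables as above; untested):

// Sketch: explicit-batch launch. The batch size comes from the binding
// dimensions set on the context, not from an argument to the enqueue call.
context->setBindingDimensions(engine->getBindingIndex("input_image"),
        nvinfer1::Dims4(batchsize, 3, iH, iW));
if (!context->allInputDimensionsSpecified()) {
    cout << "input dimensions not fully specified\n";
    std::abort();
}
// Query the *context* (not the engine) for the concrete output shape once the
// input shape is set; size res and the device output buffer from this, so
// they cover all batchsize images rather than just the first.
Dims3 real_out = static_cast<Dims3&&>(context->getBindingDimensions(
        engine->getBindingIndex("preds")));
int real_out_size = real_out.d[0] * real_out.d[1] * real_out.d[2];
context->enqueueV2(buffs.data(), stream, nullptr);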

syedrz · Aug 24 '22 12:08

If you fix batchsize=2, maybe you do not need to use dynamic batch.

CoinCheung · Aug 25 '22 01:08

I tried with batchsize=2; the issue still persists.

Would you please try TRT batch inference with two images?

Thanks

syedrz · Aug 25 '22 10:08