[QUESTION] How to use NMS operator to process yolov8 output?
Hi!
Having a hard time figuring out how to properly process YOLOv8 output with the Non-Maximum Suppression operator.
I've got the YOLO output in device memory. How would I go about creating the required tensors for the NMS operator, ideally without copying memory from device to host for processing and then back to device again for the NMS operator?
As far as I understand, the YOLOv8 output for batch size 1 is planar, data type f32: [x, x, x ... x] [y, y, y ... y] [w, w, w ... w] [h, h, h ... h] [cls, cls, cls ... cls]
And the NMS operator expects it interleaved for batch size 1, data type int16: [x, y, w, h ... x, y, w, h]
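So for box i, if I've got the layouts right, the same values live at these offsets in the two buffers:

// yolov8 planar f32 output (numBoxes = number of candidate boxes):
//   x = out[0 * numBoxes + i], y = out[1 * numBoxes + i]
//   w = out[2 * numBoxes + i], h = out[3 * numBoxes + i]
// NMS interleaved int16 input:
//   box[i] = { in[4 * i + 0], in[4 * i + 1], in[4 * i + 2], in[4 * i + 3] }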
The only solution I've come up with is to copy device memory to host, build an array of 4-component int16 vectors, copy it back to device memory, and pass that to the operator, which sounds really bad.
This is what I've currently got:
struct rect
{
int16_t x,y,w,h;
};
std::vector<float> x(outputWidth);
std::vector<float> y(outputWidth);
std::vector<float> w(outputWidth);
std::vector<float> h(outputWidth);
std::vector<float> s(outputWidth);
// basePtr() is a byte pointer, so cast to float* so the row offsets below count in floats
auto ptr = reinterpret_cast<const float*>(frame->outputLayer->basePtr());
cudaStream_t stream;
cudaStreamCreate(&stream);
CHECK_CUDA_ERROR(cudaMemcpyAsync(x.data(), ptr, outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(y.data(), ptr + (outputWidth * 1), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(w.data(), ptr + (outputWidth * 2), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(h.data(), ptr + (outputWidth * 3), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA_ERROR(cudaMemcpyAsync(s.data(), ptr + (outputWidth*4), outputWidth * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// convert centre-format float boxes to top-left int16 boxes for the NMS operator
std::vector<rect> boxes;
boxes.reserve(outputWidth);
for (int i = 0; i < outputWidth; i++)
{
    boxes.push_back({
        static_cast<int16_t>((x[i] - 0.5f * w[i]) * xFactor),
        static_cast<int16_t>((y[i] - 0.5f * h[i]) * yFactor),
        static_cast<int16_t>(w[i] * xFactor),
        static_cast<int16_t>(h[i] * yFactor)
    });
}
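What I'd actually like is to do that conversion on the GPU instead, something along these lines (just a rough sketch, the kernel name and launch config are made up and untested):

// same rect struct as above (int16_t from <cstdint>)
struct rect { int16_t x, y, w, h; };

// Converts the planar f32 yolov8 output [x...][y...][w...][h...] into
// interleaved int16 [x,y,w,h] boxes entirely in device memory,
// turning centre coordinates into top-left corners on the way.
__global__ void planarToBoxes(const float* out, rect* boxes, int numBoxes,
                              float xFactor, float yFactor)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numBoxes)
        return;

    float cx = out[0 * numBoxes + i];
    float cy = out[1 * numBoxes + i];
    float w  = out[2 * numBoxes + i];
    float h  = out[3 * numBoxes + i];

    boxes[i].x = static_cast<int16_t>((cx - 0.5f * w) * xFactor);
    boxes[i].y = static_cast<int16_t>((cy - 0.5f * h) * yFactor);
    boxes[i].w = static_cast<int16_t>(w * xFactor);
    boxes[i].h = static_cast<int16_t>(h * yFactor);
}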
For context, frame->outputLayer is an nvcv::Optional<nvcv::TensorDataStridedCuda>, which is created like this:
nvcv::Tensor::Requirements reqsOutputLayer = nvcv::Tensor::CalcRequirements(1, { model.output.dims[2], model.output.dims[1] }, nvcv::FMT_RGBf32p);
nvcv::TensorDataStridedCuda::Buffer bufOutputLayer;
std::copy(reqsOutputLayer.strides, reqsOutputLayer.strides + NVCV_TENSOR_MAX_RANK, bufOutputLayer.strides);
CHECK_CUDA_ERROR(cudaMalloc((void**)&bufOutputLayer.basePtr, CalcTotalSizeBytes(nvcv::Requirements{reqsOutputLayer.mem}.cudaMem())));
nvcv::TensorDataStridedCuda outputLayerTensorData(
    nvcv::TensorShape{reqsOutputLayer.shape, reqsOutputLayer.rank, reqsOutputLayer.layout},
    nvcv::DataType{reqsOutputLayer.dtype},
    bufOutputLayer);
nvcv::Tensor outputLayerTensor = TensorWrapData(outputLayerTensorData);
and then passed to TensorRT inference:
// bindings for executeV2 (local array instead of leaking a new[] allocation)
void* bindings[] = { frame->inputLayer->basePtr(),
                     outputLayerTensor.exportData<nvcv::TensorDataStridedCuda>()->basePtr() };
context->executeV2(bindings);
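So ideally the whole flow would stay on the device, roughly like this (again just a sketch; planarToBoxes is the hypothetical kernel from above and stream is a CUDA stream I'd keep around):

// Device buffer for the interleaved int16 boxes
rect* dBoxes = nullptr;
CHECK_CUDA_ERROR(cudaMalloc(&dBoxes, outputWidth * sizeof(rect)));

// After inference, convert the planar f32 output straight into dBoxes on the GPU
auto outData = outputLayerTensor.exportData<nvcv::TensorDataStridedCuda>();
const float* outPtr = reinterpret_cast<const float*>(outData->basePtr());
planarToBoxes<<<(outputWidth + 255) / 256, 256, 0, stream>>>(
    outPtr, dBoxes, outputWidth, xFactor, yFactor);

// dBoxes could then be wrapped in an nvcv tensor and handed to the NMS operator
// without any device-to-host copies, but that wrapping is the part I'm unsure about.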